From 634accf90837782720fdcd853d4b32531743ced1 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Sat, 19 Dec 2020 23:06:04 -0500 Subject: [PATCH 01/20] add trec-covid pre-built index to pyserini --- pyserini/prebuilt_index_info.py | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 76ac15778..7490e10ff 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -151,5 +151,65 @@ "documents": 4170312, "unique_terms": -1, "downloaded": False + }, + "trec-covid5": { + "description": "Trec Covid Round 5", + "urls": [ + "https://www.dropbox.com/s/9hfowxi7zenuaay/lucene-index-cord19-abstract-2020-07-16.tar.gz?dl=1" + ], + "md5": "c883571ccc78b4c2ce05b41eb07f5405", + "size compressed (bytes)": 2796524, + "total_terms": -1, + "documents": 18, + "unique_terms": -1, + "downloaded": False + }, + "trec-covid4": { + "description": "Trec Covid Round 4", + "urls": [ + "https://www.dropbox.com/s/x8wbuy0atgnajfd/lucene-index-cord19-abstract-2020-06-19.tar.gz?dl=1" + ], + "md5": "029bd55daba8800fbae2be9e5fcd7b33", + "size compressed (bytes)": 2584264, + "total_terms": -1, + "documents": 18, + "unique_terms": -1, + "downloaded": False + }, + "trec-covid3": { + "description": "Trec Covid Round 3", + "urls": [ + "https://www.dropbox.com/s/7bbz6pm4rduqvx3/lucene-index-cord19-abstract-2020-05-19.tar.gz?dl=1" + ], + "md5": "37bb97d0c41d650ba8e135fd75ae8fd8", + "size compressed (bytes)": 2190328, + "total_terms": -1, + "documents": 18, + "unique_terms": -1, + "downloaded": False + }, + "trec-covid2": { + "description": "Trec Covid Round 2", + "urls": [ + "https://www.dropbox.com/s/jdsc6wu0vbumpup/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1" + ], + "md5": "a06e71a98a68d31148cb0e97e70a2ee1", + "size compressed (bytes)": 1575804, + "total_terms": -1, + "documents": 18, + "unique_terms": -1, + "downloaded": False + }, + "trec-covid1": { + "description": "Trec Covid Round 1", + "urls": [ + "https://www.dropbox.com/s/iebape2yfgkzkt1/lucene-index-covid-2020-04-10.tar.gz?dl=1" + ], + "md5": "ec239d56498c0e7b74e3b41e1ce5d42a", + "size compressed (bytes)": 1621440, + "total_terms": -1, + "documents": 18, + "unique_terms": -1, + "downloaded": False } } From 05f3722e6202ec9aafa110d99f96aadbd99ff5a6 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Sun, 20 Dec 2020 23:18:39 -0500 Subject: [PATCH 02/20] fix a bug on total_terms,documents, and unique_terms number for trec-covid pre-built index --- pyserini/prebuilt_index_info.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 7490e10ff..f0cea840c 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -159,9 +159,9 @@ ], "md5": "c883571ccc78b4c2ce05b41eb07f5405", "size compressed (bytes)": 2796524, - "total_terms": -1, - "documents": 18, - "unique_terms": -1, + "total_terms": 22100404, + "documents": 192459, + "unique_terms": 195875, "downloaded": False }, "trec-covid4": { @@ -171,9 +171,9 @@ ], "md5": "029bd55daba8800fbae2be9e5fcd7b33", "size compressed (bytes)": 2584264, - "total_terms": -1, - "documents": 18, - "unique_terms": -1, + "total_terms": 18724353, + "documents": 158226, + "unique_terms": 179937, "downloaded": False }, "trec-covid3": { @@ -183,9 +183,9 @@ ], "md5": "37bb97d0c41d650ba8e135fd75ae8fd8", "size compressed (bytes)": 2190328, - "total_terms": -1, - "documents": 18, - "unique_terms": -1, + "total_terms": 16278419, + "documents": 128465, + "unique_terms": 168291, "downloaded": False }, "trec-covid2": { @@ -195,9 +195,9 @@ ], "md5": "a06e71a98a68d31148cb0e97e70a2ee1", "size compressed (bytes)": 1575804, - "total_terms": -1, - "documents": 18, - "unique_terms": -1, + "total_terms": 7651125, + "documents": 59873, + "unique_terms": 109750, "downloaded": False }, "trec-covid1": { @@ -207,9 +207,9 @@ ], "md5": "ec239d56498c0e7b74e3b41e1ce5d42a", "size compressed (bytes)": 1621440, - "total_terms": -1, - "documents": 18, - "unique_terms": -1, + "total_terms": 6672525, + "documents": 51069, + "unique_terms": 104595, "downloaded": False } } From 4647ba19a78ff91410a49ce4b74cf46c1efede92 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Mon, 28 Dec 2020 17:37:15 -0500 Subject: [PATCH 03/20] add full-text and paragraph pre-built index for trec covid round 1-5 --- pyserini/prebuilt_index_info.py | 165 +++++++++++++++++++++++++++++--- 1 file changed, 150 insertions(+), 15 deletions(-) diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index f0cea840c..15c88e911 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -152,10 +152,11 @@ "unique_terms": -1, "downloaded": False }, - "trec-covid5": { - "description": "Trec Covid Round 5", + "trec-covid-r5-abstract": { + "description": "TREC-COVID Round 5: abstract index", "urls": [ - "https://www.dropbox.com/s/9hfowxi7zenuaay/lucene-index-cord19-abstract-2020-07-16.tar.gz?dl=1" + "https://www.dropbox.com/s/9hfowxi7zenuaay/lucene-index-cord19-abstract-2020-07-16.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-07-16/lucene-index-cord19-abstract-2020-07-16.tar.gz" ], "md5": "c883571ccc78b4c2ce05b41eb07f5405", "size compressed (bytes)": 2796524, @@ -164,10 +165,37 @@ "unique_terms": 195875, "downloaded": False }, - "trec-covid4": { - "description": "Trec Covid Round 4", + "trec-covid-r5-full-text": { + "description": "TREC-COVID Round 5: full-text index", "urls": [ - "https://www.dropbox.com/s/x8wbuy0atgnajfd/lucene-index-cord19-abstract-2020-06-19.tar.gz?dl=1" + "https://www.dropbox.com/s/dyd9sggrqo44d0n/lucene-index-cord19-full-text-2020-07-16.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-07-16/lucene-index-cord19-full-text-2020-07-16.tar.gz" + ], + "md5": "23cfad89b4c206d66125f5736f60248f", + "size compressed (bytes)": 5351744, + "total_terms": 275238847, + "documents": 192460, + "unique_terms": 1843368, + "downloaded": False + }, + "trec-covid-r5-paragraph": { + "description": "TREC-COVID Round 5: paragraph index", + "urls": [ + "https://www.dropbox.com/s/jdfbrnohtkrvds5/lucene-index-cord19-paragraph-2020-07-16.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-07-16/lucene-index-cord19-paragraph-2020-07-16.tar.gz" + ], + "md5": "c2c6ac832f8a1fcb767d2356d2b1e1df", + "size compressed (bytes)": 11352968, + "total_terms": 627083574, + "documents": 3010497, + "unique_terms": 1843368, + "downloaded": False + }, + "trec-covid-r4-abstract": { + "description": "TREC-COVID Round 4: abstract index", + "urls": [ + "https://www.dropbox.com/s/x8wbuy0atgnajfd/lucene-index-cord19-abstract-2020-06-19.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-06-19/lucene-index-cord19-abstract-2020-06-19.tar.gz" ], "md5": "029bd55daba8800fbae2be9e5fcd7b33", "size compressed (bytes)": 2584264, @@ -176,10 +204,37 @@ "unique_terms": 179937, "downloaded": False }, - "trec-covid3": { - "description": "Trec Covid Round 3", + "trec-covid-r4-full-text": { + "description": "TREC-COVID Round 4: full-text index", "urls": [ - "https://www.dropbox.com/s/7bbz6pm4rduqvx3/lucene-index-cord19-abstract-2020-05-19.tar.gz?dl=1" + "https://www.dropbox.com/s/tf469r70r8aigu2/lucene-index-cord19-full-text-2020-06-19.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-06-19/lucene-index-cord19-full-text-2020-06-19.tar.gz" + ], + "md5": "3d0eb12094a24cff9bcacd1f17c3ea1c", + "size compressed (bytes)": 4983900, + "total_terms": 254810123, + "documents": 158227, + "unique_terms": 1783089, + "downloaded": False + }, + "trec-covid-r4-paragraph": { + "description": "TREC-COVID Round 4: paragraph index", + "urls": [ + "https://www.dropbox.com/s/fr3v69vhryevwp9/lucene-index-cord19-paragraph-2020-06-19.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-06-19/lucene-index-cord19-paragraph-2020-06-19.tar.gz" + ], + "md5": "5cd8cd6998177bed7a3e0057ef8b3595", + "size compressed (bytes)": 10382704, + "total_terms": 567579834, + "documents": 2781172, + "unique_terms": 1783089, + "downloaded": False + }, + "trec-covid-r3-abstract": { + "description": "TREC-COVID Round 3: abstract index", + "urls": [ + "https://www.dropbox.com/s/7bbz6pm4rduqvx3/lucene-index-cord19-abstract-2020-05-19.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-19/lucene-index-cord19-abstract-2020-05-19.tar.gz" ], "md5": "37bb97d0c41d650ba8e135fd75ae8fd8", "size compressed (bytes)": 2190328, @@ -188,10 +243,37 @@ "unique_terms": 168291, "downloaded": False }, - "trec-covid2": { - "description": "Trec Covid Round 2", + "trec-covid-r3-full-text": { + "description": "TREC-COVID Round 3: full-text index", + "urls": [ + "https://www.dropbox.com/s/bxhldgks1rxz4ly/lucene-index-cord19-full-text-2020-05-19.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-19/lucene-index-cord19-full-text-2020-05-19.tar.gz" + ], + "md5": "f5711915a66cd2b511e0fb8d03e4c325", + "size compressed (bytes)": 4233300, + "total_terms": 215806519, + "documents": 128465, + "unique_terms": 1620335, + "downloaded": False + }, + "trec-covid-r3-paragraph": { + "description": "TREC-COVID Round 3: paragraph index", "urls": [ - "https://www.dropbox.com/s/jdsc6wu0vbumpup/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1" + "https://www.dropbox.com/s/2ewjchln0ihm6hh/lucene-index-cord19-paragraph-2020-05-19.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-19/lucene-index-cord19-paragraph-2020-05-19.tar.gz" + ], + "md5": "012ab1f804382b2275c433a74d7d31f2", + "size compressed (bytes)": 9053524, + "total_terms": 485309568, + "documents": 2297201, + "unique_terms": 1620335, + "downloaded": False + }, + "trec-covid-r2-abstract": { + "description": "TREC-COVID Round 2: abstract index", + "urls": [ + "https://www.dropbox.com/s/jdsc6wu0vbumpup/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-01/lucene-index-cord19-abstract-2020-05-01.tar.gz" ], "md5": "a06e71a98a68d31148cb0e97e70a2ee1", "size compressed (bytes)": 1575804, @@ -200,10 +282,37 @@ "unique_terms": 109750, "downloaded": False }, - "trec-covid1": { - "description": "Trec Covid Round 1", + "trec-covid-r2-full-text": { + "description": "TREC-COVID Round 2: full-text index", + "urls": [ + "https://www.dropbox.com/s/ouvp7zyqsp9y9gh/lucene-index-cord19-full-text-2020-05-01.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-01/lucene-index-cord19-full-text-2020-05-01.tar.gz" + ], + "md5": "e7eca1b976cdf2cd80e908c9ac2263cb", + "size compressed (bytes)": 3088540, + "total_terms": 154736295, + "documents": 59876, + "unique_terms": 1214374, + "downloaded": False + }, + "trec-covid-r2-paragraph": { + "description": "TREC-COVID Round 2: paragraph index", + "urls": [ + "https://www.dropbox.com/s/e1118vjuf58ojt4/lucene-index-cord19-paragraph-2020-05-01.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-01/lucene-index-cord19-paragraph-2020-05-01.tar.gz" + ], + "md5": "8f9321757a03985ac1c1952b2fff2c7d", + "size compressed (bytes)": 6881696, + "total_terms": 360119048, + "documents": 1758168, + "unique_terms": 1214374, + "downloaded": False + }, + "trec-covid-r1-abstract": { + "description": "TREC-COVID Round 1: abstract index", "urls": [ - "https://www.dropbox.com/s/iebape2yfgkzkt1/lucene-index-covid-2020-04-10.tar.gz?dl=1" + "https://www.dropbox.com/s/iebape2yfgkzkt1/lucene-index-covid-2020-04-10.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-04-10/lucene-index-covid-2020-04-10.tar.gz" ], "md5": "ec239d56498c0e7b74e3b41e1ce5d42a", "size compressed (bytes)": 1621440, @@ -211,5 +320,31 @@ "documents": 51069, "unique_terms": 104595, "downloaded": False + }, + "trec-covid-r1-full-text": { + "description": "TREC-COVID Round 1: full-text index", + "urls": [ + "https://www.dropbox.com/s/pfouskfoxb471e6/lucene-index-covid-full-text-2020-04-10.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-04-10/lucene-index-covid-full-text-2020-04-10.tar.gz" + ], + "md5": "401a6f5583b0f05340c73fbbeb3279c8", + "size compressed (bytes)": 4471820, + "total_terms": 315624154, + "documents": 51071, + "unique_terms": 1812522, + "downloaded": False + }, + "trec-covid-r1-paragraph": { + "description": "TREC-COVID Round 1: paragraph index", + "urls": [ + "https://www.dropbox.com/s/yr0bj5pxu2k89n0/lucene-index-covid-paragraph-2020-04-10.tar.gz?dl=1", + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-04-10/lucene-index-covid-paragraph-2020-04-10.tar.gz" + ], + "md5": "8b87a2c55bc0a15b87f11e796860216a", + "size compressed (bytes)": 5994192, + "total_terms": 330715243, + "documents": 1412648, + "unique_terms": 944574, + "downloaded": False } } From f0648186f7b489333bf869021fc7119757de79d4 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Mon, 28 Dec 2020 19:09:20 -0500 Subject: [PATCH 04/20] update the github from raw html landing to raw tarball --- pyserini/prebuilt_index_info.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 15c88e911..9281231b1 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -156,7 +156,7 @@ "description": "TREC-COVID Round 5: abstract index", "urls": [ "https://www.dropbox.com/s/9hfowxi7zenuaay/lucene-index-cord19-abstract-2020-07-16.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-07-16/lucene-index-cord19-abstract-2020-07-16.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-07-16/lucene-index-cord19-abstract-2020-07-16.tar.gz" ], "md5": "c883571ccc78b4c2ce05b41eb07f5405", "size compressed (bytes)": 2796524, @@ -169,7 +169,7 @@ "description": "TREC-COVID Round 5: full-text index", "urls": [ "https://www.dropbox.com/s/dyd9sggrqo44d0n/lucene-index-cord19-full-text-2020-07-16.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-07-16/lucene-index-cord19-full-text-2020-07-16.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-07-16/lucene-index-cord19-full-text-2020-07-16.tar.gz" ], "md5": "23cfad89b4c206d66125f5736f60248f", "size compressed (bytes)": 5351744, @@ -182,7 +182,7 @@ "description": "TREC-COVID Round 5: paragraph index", "urls": [ "https://www.dropbox.com/s/jdfbrnohtkrvds5/lucene-index-cord19-paragraph-2020-07-16.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-07-16/lucene-index-cord19-paragraph-2020-07-16.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-07-16/lucene-index-cord19-paragraph-2020-07-16.tar.gz" ], "md5": "c2c6ac832f8a1fcb767d2356d2b1e1df", "size compressed (bytes)": 11352968, @@ -195,7 +195,7 @@ "description": "TREC-COVID Round 4: abstract index", "urls": [ "https://www.dropbox.com/s/x8wbuy0atgnajfd/lucene-index-cord19-abstract-2020-06-19.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-06-19/lucene-index-cord19-abstract-2020-06-19.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-06-19/lucene-index-cord19-abstract-2020-06-19.tar.gz" ], "md5": "029bd55daba8800fbae2be9e5fcd7b33", "size compressed (bytes)": 2584264, @@ -208,7 +208,7 @@ "description": "TREC-COVID Round 4: full-text index", "urls": [ "https://www.dropbox.com/s/tf469r70r8aigu2/lucene-index-cord19-full-text-2020-06-19.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-06-19/lucene-index-cord19-full-text-2020-06-19.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-06-19/lucene-index-cord19-full-text-2020-06-19.tar.gz" ], "md5": "3d0eb12094a24cff9bcacd1f17c3ea1c", "size compressed (bytes)": 4983900, @@ -221,7 +221,7 @@ "description": "TREC-COVID Round 4: paragraph index", "urls": [ "https://www.dropbox.com/s/fr3v69vhryevwp9/lucene-index-cord19-paragraph-2020-06-19.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-06-19/lucene-index-cord19-paragraph-2020-06-19.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-06-19/lucene-index-cord19-paragraph-2020-06-19.tar.gz" ], "md5": "5cd8cd6998177bed7a3e0057ef8b3595", "size compressed (bytes)": 10382704, @@ -234,7 +234,7 @@ "description": "TREC-COVID Round 3: abstract index", "urls": [ "https://www.dropbox.com/s/7bbz6pm4rduqvx3/lucene-index-cord19-abstract-2020-05-19.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-19/lucene-index-cord19-abstract-2020-05-19.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-05-19/lucene-index-cord19-abstract-2020-05-19.tar.gz" ], "md5": "37bb97d0c41d650ba8e135fd75ae8fd8", "size compressed (bytes)": 2190328, @@ -247,7 +247,7 @@ "description": "TREC-COVID Round 3: full-text index", "urls": [ "https://www.dropbox.com/s/bxhldgks1rxz4ly/lucene-index-cord19-full-text-2020-05-19.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-19/lucene-index-cord19-full-text-2020-05-19.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-05-19/lucene-index-cord19-full-text-2020-05-19.tar.gz" ], "md5": "f5711915a66cd2b511e0fb8d03e4c325", "size compressed (bytes)": 4233300, @@ -260,7 +260,7 @@ "description": "TREC-COVID Round 3: paragraph index", "urls": [ "https://www.dropbox.com/s/2ewjchln0ihm6hh/lucene-index-cord19-paragraph-2020-05-19.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-19/lucene-index-cord19-paragraph-2020-05-19.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-05-19/lucene-index-cord19-paragraph-2020-05-19.tar.gz" ], "md5": "012ab1f804382b2275c433a74d7d31f2", "size compressed (bytes)": 9053524, @@ -273,7 +273,7 @@ "description": "TREC-COVID Round 2: abstract index", "urls": [ "https://www.dropbox.com/s/jdsc6wu0vbumpup/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-01/lucene-index-cord19-abstract-2020-05-01.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-05-01/lucene-index-cord19-abstract-2020-05-01.tar.gz" ], "md5": "a06e71a98a68d31148cb0e97e70a2ee1", "size compressed (bytes)": 1575804, @@ -286,7 +286,7 @@ "description": "TREC-COVID Round 2: full-text index", "urls": [ "https://www.dropbox.com/s/ouvp7zyqsp9y9gh/lucene-index-cord19-full-text-2020-05-01.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-01/lucene-index-cord19-full-text-2020-05-01.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-05-01/lucene-index-cord19-full-text-2020-05-01.tar.gz" ], "md5": "e7eca1b976cdf2cd80e908c9ac2263cb", "size compressed (bytes)": 3088540, @@ -299,7 +299,7 @@ "description": "TREC-COVID Round 2: paragraph index", "urls": [ "https://www.dropbox.com/s/e1118vjuf58ojt4/lucene-index-cord19-paragraph-2020-05-01.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-05-01/lucene-index-cord19-paragraph-2020-05-01.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-05-01/lucene-index-cord19-paragraph-2020-05-01.tar.gz" ], "md5": "8f9321757a03985ac1c1952b2fff2c7d", "size compressed (bytes)": 6881696, @@ -312,7 +312,7 @@ "description": "TREC-COVID Round 1: abstract index", "urls": [ "https://www.dropbox.com/s/iebape2yfgkzkt1/lucene-index-covid-2020-04-10.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-04-10/lucene-index-covid-2020-04-10.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-04-10/lucene-index-covid-2020-04-10.tar.gz" ], "md5": "ec239d56498c0e7b74e3b41e1ce5d42a", "size compressed (bytes)": 1621440, @@ -325,7 +325,7 @@ "description": "TREC-COVID Round 1: full-text index", "urls": [ "https://www.dropbox.com/s/pfouskfoxb471e6/lucene-index-covid-full-text-2020-04-10.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-04-10/lucene-index-covid-full-text-2020-04-10.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-04-10/lucene-index-covid-full-text-2020-04-10.tar.gz" ], "md5": "401a6f5583b0f05340c73fbbeb3279c8", "size compressed (bytes)": 4471820, @@ -338,7 +338,7 @@ "description": "TREC-COVID Round 1: paragraph index", "urls": [ "https://www.dropbox.com/s/yr0bj5pxu2k89n0/lucene-index-covid-paragraph-2020-04-10.tar.gz?dl=1", - "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/blob/master/2020-04-10/lucene-index-covid-paragraph-2020-04-10.tar.gz" + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/-/raw/master/2020-04-10/lucene-index-covid-paragraph-2020-04-10.tar.gz" ], "md5": "8b87a2c55bc0a15b87f11e796860216a", "size compressed (bytes)": 5994192, From b370561c9046943a85ec807fc178de6fab4a7e67 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Fri, 1 Jan 2021 04:01:05 -0500 Subject: [PATCH 05/20] add test for trec_covid_r3 scripts --- integrations/test_trec_covid_r3.py | 68 ++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 integrations/test_trec_covid_r3.py diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py new file mode 100644 index 000000000..d2a27c1da --- /dev/null +++ b/integrations/test_trec_covid_r3.py @@ -0,0 +1,68 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import hashlib +import os +import re +import shutil +import unittest +from random import randint +from pyserini.util import download_url + + +class TestSearchIntegration(unittest.TestCase): + def setUp(self): + self.round3_runs = { + 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': + 'dfccc32efd58a8284ae411e5c6b27ce9', + 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': + '7a5c27e8e052c49ff72d557051825973' + } + self.tmp = f'../trec-covid-r3/check_data' + + # In the rare event there's a collision + if os.path.exists(self.tmp): + shutil.rmtree(self.tmp) + + os.mkdir(self.tmp) + for url in self.round3_runs: + print(f'Verifying stored run at {url}...') + filename = url.split('/')[-1] + filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter + + download_url(url, self.tmp, md5=self.round3_runs[url], force=True) + self.assertTrue(os.path.exists(os.path.join(self.tmp, filename))) + print('') + + + def test_round3_score_runs(self): + os.system(f'python3 ../trec-covid-r3/ranker.py \ + -alpha 0.6 \ + -clf lr \ + -vectorizer tfidf \ + -trec_covid_home ../trec-covid-r3 \ + -base ../trec-covid-r3/data/covidex.r4.d2q.duot5 \ + -qrels {self.tmp}/qrels.covid-round3-cumulative.txt \ + -index ../trec-covid-r3/data/lucene-index-cord19-abstract-2020-06-19 \ + -tag ../trec-covid-r3/data/covidex.r4.d2q.duot5.lr') + + + # def tearDown(self): + # shutil.rmtree(self.tmp) + # shutil.rmtree('runs') + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From fa4d12936fbdefddd2a75d00d924187b0774dc64 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Fri, 1 Jan 2021 15:49:11 -0500 Subject: [PATCH 06/20] fix bug on pre-built index and terdown function --- integrations/test_trec_covid_r3.py | 45 +++++++++++++++++++----------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py index d2a27c1da..0ba0dd752 100644 --- a/integrations/test_trec_covid_r3.py +++ b/integrations/test_trec_covid_r3.py @@ -18,19 +18,26 @@ import re import shutil import unittest +import json + +import sys +sys.path.append('..') + from random import randint -from pyserini.util import download_url +from pyserini.util import download_url,download_prebuilt_index +from integrations.simplesearcher_checker import SimpleSearcherChecker + class TestSearchIntegration(unittest.TestCase): def setUp(self): - self.round3_runs = { + self.round4_runs = { 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': 'dfccc32efd58a8284ae411e5c6b27ce9', 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': '7a5c27e8e052c49ff72d557051825973' } - self.tmp = f'../trec-covid-r3/check_data' + self.tmp = f'tmp{randint(0, 10000)}' # In the rare event there's a collision if os.path.exists(self.tmp): @@ -47,21 +54,27 @@ def setUp(self): print('') - def test_round3_score_runs(self): - os.system(f'python3 ../trec-covid-r3/ranker.py \ - -alpha 0.6 \ - -clf lr \ - -vectorizer tfidf \ - -trec_covid_home ../trec-covid-r3 \ - -base ../trec-covid-r3/data/covidex.r4.d2q.duot5 \ - -qrels {self.tmp}/qrels.covid-round3-cumulative.txt \ - -index ../trec-covid-r3/data/lucene-index-cord19-abstract-2020-06-19 \ - -tag ../trec-covid-r3/data/covidex.r4.d2q.duot5.lr') + def test_bm25(self): + + prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') + os.system(f'python ../trec-covid-r3/ranker.py \ + -alpha 0.6 \ + -clf lr \ + -vectorizer tfidf \ + -trec_covid_home ../trec-covid-r3 \ + -base ../trec-covid-r3/data/covidex.r4.d2q.duot5 \ + -qrels ../tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ + -index {prebuilt_index_path} \ + -tag ../trec-covid-r3/data/covidex.r4.d2q.duot5.lr \ + -output {self.tmp}/output.json') + with open(f'{self.tmp}/output.json') as json_file: + data = json.load(json_file) + self.assertEqual("0.1764\\n'", data['map']) + self.assertEqual("0.7662\\n'", data['ndcg']) - # def tearDown(self): - # shutil.rmtree(self.tmp) - # shutil.rmtree('runs') + def tearDown(self): + shutil.rmtree(self.tmp) if __name__ == '__main__': From 6ea7db8b6b18389ca83d45611ccd46c0463eee52 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Sat, 2 Jan 2021 12:38:48 -0500 Subject: [PATCH 07/20] fix typo on function name --- integrations/test_trec_covid_r3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py index 0ba0dd752..409658c35 100644 --- a/integrations/test_trec_covid_r3.py +++ b/integrations/test_trec_covid_r3.py @@ -44,12 +44,12 @@ def setUp(self): shutil.rmtree(self.tmp) os.mkdir(self.tmp) - for url in self.round3_runs: + for url in self.round4_runs: print(f'Verifying stored run at {url}...') filename = url.split('/')[-1] filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter - download_url(url, self.tmp, md5=self.round3_runs[url], force=True) + download_url(url, self.tmp, md5=self.round4_runs[url], force=True) self.assertTrue(os.path.exists(os.path.join(self.tmp, filename))) print('') From e9828ee08436d05706fe5b5658d8f9f1da7b9f40 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Sun, 3 Jan 2021 10:34:48 -0500 Subject: [PATCH 08/20] add script for downloading files to runs --- integrations/test_trec_covid_r3.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py index 409658c35..1e1a57b63 100644 --- a/integrations/test_trec_covid_r3.py +++ b/integrations/test_trec_covid_r3.py @@ -19,6 +19,7 @@ import shutil import unittest import json +import gzip import sys sys.path.append('..') @@ -35,9 +36,15 @@ def setUp(self): 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': 'dfccc32efd58a8284ae411e5c6b27ce9', 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': - '7a5c27e8e052c49ff72d557051825973' + '7a5c27e8e052c49ff72d557051825973', } + self.tmp = f'tmp{randint(0, 10000)}' + download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', 'runs') + + with gzip.open(f'runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: + with open(f'runs/covidex.r4.d2q.duot5', 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) # In the rare event there's a collision if os.path.exists(self.tmp): @@ -57,15 +64,15 @@ def setUp(self): def test_bm25(self): prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') - os.system(f'python ../trec-covid-r3/ranker.py \ + os.system(f'python3 ../../trec-covid-r3/ranker.py \ -alpha 0.6 \ -clf lr \ -vectorizer tfidf \ - -trec_covid_home ../trec-covid-r3 \ - -base ../trec-covid-r3/data/covidex.r4.d2q.duot5 \ + -new_qrels ../tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \ + -base runs/covidex.r4.d2q.duot5 \ -qrels ../tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ -index {prebuilt_index_path} \ - -tag ../trec-covid-r3/data/covidex.r4.d2q.duot5.lr \ + -tag ../../trec-covid-r3/data/covidex.r4.d2q.duot5.lr \ -output {self.tmp}/output.json') with open(f'{self.tmp}/output.json') as json_file: data = json.load(json_file) @@ -75,6 +82,9 @@ def test_bm25(self): def tearDown(self): shutil.rmtree(self.tmp) + os.remove('runs/covidex.r4.d2q.duot5.gz') + os.remove('runs/covidex.r4.d2q.duot5') + os.remove('runs/covidex.r4.d2q.duot5.lr.tfidf.R12.A0.6.txt') if __name__ == '__main__': From 7c8554263916b15ddfb865364fc1ee8474c948fa Mon Sep 17 00:00:00 2001 From: Yuqi Date: Sun, 3 Jan 2021 21:20:54 -0500 Subject: [PATCH 09/20] allow the script to be called outside integration --- integrations/test_trec_covid_r3.py | 43 ++++++++++++++++-------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py index 1e1a57b63..fa43f4371 100644 --- a/integrations/test_trec_covid_r3.py +++ b/integrations/test_trec_covid_r3.py @@ -23,11 +23,10 @@ import sys sys.path.append('..') - +print(sys.path) from random import randint -from pyserini.util import download_url,download_prebuilt_index from integrations.simplesearcher_checker import SimpleSearcherChecker - +from pyserini.util import download_url, download_prebuilt_index class TestSearchIntegration(unittest.TestCase): @@ -38,12 +37,18 @@ def setUp(self): 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': '7a5c27e8e052c49ff72d557051825973', } - - self.tmp = f'tmp{randint(0, 10000)}' - download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', 'runs') - - with gzip.open(f'runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: - with open(f'runs/covidex.r4.d2q.duot5', 'wb') as f_out: + curdir = os.getcwd() + if curdir.endswith('integrations'): + self.pyserini_root = '..' + else: + self.pyserini_root = '.' + + self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' + download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', + f'{self.pyserini_root}/integrations/runs') + + with gzip.open(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: + with open(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) # In the rare event there's a collision @@ -60,32 +65,30 @@ def setUp(self): self.assertTrue(os.path.exists(os.path.join(self.tmp, filename))) print('') - def test_bm25(self): prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') - os.system(f'python3 ../../trec-covid-r3/ranker.py \ + os.system(f'python {self.pyserini_root}/../trec-covid-r3/ranker.py \ -alpha 0.6 \ -clf lr \ -vectorizer tfidf \ - -new_qrels ../tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \ - -base runs/covidex.r4.d2q.duot5 \ - -qrels ../tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ + -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \ + -base {self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5 \ + -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ -index {prebuilt_index_path} \ - -tag ../../trec-covid-r3/data/covidex.r4.d2q.duot5.lr \ + -tag {self.pyserini_root}/../trec-covid-r3/data/covidex.r4.d2q.duot5.lr \ -output {self.tmp}/output.json') with open(f'{self.tmp}/output.json') as json_file: data = json.load(json_file) self.assertEqual("0.1764\\n'", data['map']) self.assertEqual("0.7662\\n'", data['ndcg']) - def tearDown(self): shutil.rmtree(self.tmp) - os.remove('runs/covidex.r4.d2q.duot5.gz') - os.remove('runs/covidex.r4.d2q.duot5') - os.remove('runs/covidex.r4.d2q.duot5.lr.tfidf.R12.A0.6.txt') + os.remove(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5.gz') + os.remove(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5') + os.remove(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5.lr.tfidf.R12.A0.6.txt') if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From 1ae8c58e894f445520dee1e1587a33b2ff4ac8d6 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Mon, 4 Jan 2021 15:51:47 -0500 Subject: [PATCH 10/20] add trec covid ranker file to pyserini --- integrations/simplesearcher_checker.py | 4 ++- integrations/test_trec_covid_r3.py | 40 ++++++++++++++------------ 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/integrations/simplesearcher_checker.py b/integrations/simplesearcher_checker.py index 889bd2a3d..7c08433bd 100644 --- a/integrations/simplesearcher_checker.py +++ b/integrations/simplesearcher_checker.py @@ -31,7 +31,8 @@ def __init__(self, anserini_root: str, index: str, topics: str, pyserini_topics: 'target/appassembler/bin/SearchCollection -topicreader Trec') self.pyserini_base_cmd = 'python3 -m pyserini.search' - self.eval_base_cmd = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30' + # self.eval_base_cmd = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30' + self.eval_base_cmd = 'tools/eval/trec_eval.9.0.4/trec_eval -c -M1000 -m map -m ndcg_cut_20' @staticmethod def _cleanup(files: List[str]): @@ -51,6 +52,7 @@ def run(self, runtag: str, anserini_extras: str, pyserini_extras: str): + f'-topics {self.topics} -output {anserini_output} {anserini_extras}' pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \ + f'--topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}' + print(pyserini_cmd) status = os.system(anserini_cmd) if not status == 0: diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py index fa43f4371..c8aa083b9 100644 --- a/integrations/test_trec_covid_r3.py +++ b/integrations/test_trec_covid_r3.py @@ -23,7 +23,6 @@ import sys sys.path.append('..') -print(sys.path) from random import randint from integrations.simplesearcher_checker import SimpleSearcherChecker from pyserini.util import download_url, download_prebuilt_index @@ -31,12 +30,7 @@ class TestSearchIntegration(unittest.TestCase): def setUp(self): - self.round4_runs = { - 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': - 'dfccc32efd58a8284ae411e5c6b27ce9', - 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': - '7a5c27e8e052c49ff72d557051825973', - } + curdir = os.getcwd() if curdir.endswith('integrations'): self.pyserini_root = '..' @@ -44,18 +38,28 @@ def setUp(self): self.pyserini_root = '.' self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' - download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', - f'{self.pyserini_root}/integrations/runs') - - with gzip.open(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: - with open(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5', 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) # In the rare event there's a collision if os.path.exists(self.tmp): shutil.rmtree(self.tmp) os.mkdir(self.tmp) + os.mkdir(f'{self.tmp}/runs') + + self.round4_runs = { + 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': + 'dfccc32efd58a8284ae411e5c6b27ce9', + 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': + '7a5c27e8e052c49ff72d557051825973', + } + + download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', + f'{self.tmp}/runs') + + with gzip.open(f'{self.tmp}/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: + with open(f'{self.tmp}/runs/covidex.r4.d2q.duot5', 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + for url in self.round4_runs: print(f'Verifying stored run at {url}...') filename = url.split('/')[-1] @@ -66,14 +70,15 @@ def setUp(self): print('') def test_bm25(self): - + tmp_folder_name = self.tmp.split('/')[-1] prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') - os.system(f'python {self.pyserini_root}/../trec-covid-r3/ranker.py \ + os.system(f'python {self.pyserini_root}/scripts/trec-covid-r3-ranker.py \ -alpha 0.6 \ -clf lr \ -vectorizer tfidf \ -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \ - -base {self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5 \ + -base {self.tmp}/runs/covidex.r4.d2q.duot5 \ + -tmp_base {tmp_folder_name} \ -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ -index {prebuilt_index_path} \ -tag {self.pyserini_root}/../trec-covid-r3/data/covidex.r4.d2q.duot5.lr \ @@ -85,9 +90,6 @@ def test_bm25(self): def tearDown(self): shutil.rmtree(self.tmp) - os.remove(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5.gz') - os.remove(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5') - os.remove(f'{self.pyserini_root}/integrations/runs/covidex.r4.d2q.duot5.lr.tfidf.R12.A0.6.txt') if __name__ == '__main__': From eb49795835d5f3249eed77ed7baad4dfb908f4d7 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Mon, 4 Jan 2021 15:56:21 -0500 Subject: [PATCH 11/20] fix a typo on simplecheck --- integrations/simplesearcher_checker.py | 4 +- scripts/trec-covid-r3-ranker.py | 298 +++++++++++++++++++++++++ 2 files changed, 299 insertions(+), 3 deletions(-) create mode 100644 scripts/trec-covid-r3-ranker.py diff --git a/integrations/simplesearcher_checker.py b/integrations/simplesearcher_checker.py index 7c08433bd..889bd2a3d 100644 --- a/integrations/simplesearcher_checker.py +++ b/integrations/simplesearcher_checker.py @@ -31,8 +31,7 @@ def __init__(self, anserini_root: str, index: str, topics: str, pyserini_topics: 'target/appassembler/bin/SearchCollection -topicreader Trec') self.pyserini_base_cmd = 'python3 -m pyserini.search' - # self.eval_base_cmd = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30' - self.eval_base_cmd = 'tools/eval/trec_eval.9.0.4/trec_eval -c -M1000 -m map -m ndcg_cut_20' + self.eval_base_cmd = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30' @staticmethod def _cleanup(files: List[str]): @@ -52,7 +51,6 @@ def run(self, runtag: str, anserini_extras: str, pyserini_extras: str): + f'-topics {self.topics} -output {anserini_output} {anserini_extras}' pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \ + f'--topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}' - print(pyserini_cmd) status = os.system(anserini_cmd) if not status == 0: diff --git a/scripts/trec-covid-r3-ranker.py b/scripts/trec-covid-r3-ranker.py new file mode 100644 index 000000000..a7d3c7208 --- /dev/null +++ b/scripts/trec-covid-r3-ranker.py @@ -0,0 +1,298 @@ +import argparse +import os +import json +import sys +sys.path.append('..') +sys.path.append('../pyserini') +import subprocess + +from enum import Enum +from pyserini.vectorizer import TfidfVectorizer +from pyserini.vectorizer import BM25Vectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import LogisticRegression +from sklearn.svm import SVC +from typing import List +from sklearn import preprocessing +from typing import List, Set + +def normalize(scores): + low = min(scores) + high = max(scores) + width = high - low + + return [(s-low)/width for s in scores] + + +def sort_dual_list(pred, docs): + zipped_lists = zip(pred, docs) + sorted_pairs = sorted(zipped_lists) + + tuples = zip(*sorted_pairs) + pred, docs = [list(tuple) for tuple in tuples] + + pred.reverse() + docs.reverse() + return pred, docs + + +def sort_str_topics_list(topics: List[str]) -> List[str]: + res = sorted([int(t) for t in topics]) + return [str(t) for t in res] + + +def get_topics_from_qrun(path: str) -> Set[str]: + res = set() + with open(path, 'r') as f: + for line in f: + res.add(line.split()[0]) + return sort_str_topics_list(res) + + +def get_lines_by_topic(path, topic, tag): + res = [] + with open(path, 'r') as f: + for line in f: + tokens = line.split() + if tokens[0] != topic: + continue + tokens[-1] = tag + new_line = ' '.join(tokens) + res.append(new_line) + + return res + + +def read_qrels(path: str): + qrels = [] + + with open(path, 'r') as f: + for line in f: + line = line.strip() + tokens = line.split() + topic = tokens[0] + doc_id = tokens[-2] + relevance = int(tokens[-1]) + qrels.append({ + 'topic': topic, + 'doc_id': doc_id, + 'relevance': relevance + }) + + return qrels + + +def get_doc_to_id_from_qrun_by_topic(path: str, topic: str): + res = {} + with open(path, 'r') as f: + for line in f: + tokens = line.strip().split() + t = tokens[0] + if topic != t: + continue + doc_id = tokens[2] + score = float(tokens[-2]) + res[doc_id] = score + + return res + + +def get_docs_from_qrun_by_topic(path: str, topic: str): + x, y = [], [] + with open(path, 'r') as f: + for line in f: + tokens = line.strip().split() + t = tokens[0] + if topic != t: + continue + doc_id = tokens[2] + score = float(tokens[-2]) + x.append(doc_id) + y.append(score) + + return x, y + + +def get_X_Y_from_qrels_by_topic(path: str, topic: str, R: List[int]): + # always include topic 0 + R.append(0) + qrels = [qrel for qrel in read_qrels(path) if qrel['topic'] == topic and qrel['relevance'] in R] + x, y = [], [] + for pack in qrels: + x.append(pack['doc_id']) + label = 0 if pack['relevance'] == 0 else 1 + y.append(label) + + return x, y + + +class SpecterVectorizer: + def __init__(self): + path = "data/specter.csv" + self.vectors = {} + + with open(path, 'r') as f: + for line in f: + tokens = line.strip().split(',') + doc_id = tokens[0] + vector = [float(item) for item in tokens[1:]] + self.vectors[doc_id] = vector + + def get_vectors(self, doc_ids: List[str]): + res = [] + + for doc_id in doc_ids: + if doc_id in self.vectors: + res.append(self.vectors[doc_id]) + else: + print(f'{doc_id} not found') + + return preprocessing.normalize(res) + + +class ClassifierType(Enum): + SVM = 'svm' + LR = 'lr' + NB = 'nb' + + +ClassifierStr = { + ClassifierType.SVM: 'svm', + ClassifierType.LR: 'lr', + ClassifierType.NB: 'nb', +} + + +class VectorizerType(Enum): + TFIDF = 'tfidf' + BM25 = 'bm25' + SPECTER = 'specter' + + +VectorizerStr = { + VectorizerType.TFIDF: 'tfidf', + VectorizerType.BM25: 'bm25', + VectorizerType.SPECTER: 'specter', +} + + +def evaluate(qrels_path: str, run_path: str, options: str = ''): + curdir = os.getcwd() + if curdir.endswith('integrations'): + anserini_root = '../../anserini' + else: + anserini_root = '../anserini' + prefix = f"{anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -c -M1000 -m all_trec {qrels_path}" + cmd1 = f"{prefix} {run_path} {options} | grep 'ndcg_cut_20 '" + cmd2 = f"{prefix} {run_path} {options} | grep 'map '" + ndcg_score = str(subprocess.check_output(cmd1, shell=True)).split('\\t')[-1] + map_score = str(subprocess.check_output(cmd2, shell=True)).split('\\t')[-1] + return str(map_score),str(ndcg_score) + + +def rank(new_qrels: str, base: str,tmp_base:str, qrels_path: str, lucene_index_path: str, R: List[int], score_path: str, alpha: float, clf_type: ClassifierType, vec_type: VectorizerType, tag: str): + # build output path + base_str = base.split('/')[-1] + R_str = ''.join([str(i) for i in R]) + curdir = os.getcwd() + if curdir.endswith('integrations'): + output_path = f'{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt' + else: + output_path = f'integrations/{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt' + print(f'Output -> {output_path}') + os.system('mkdir -p runs') + + vectorizer = None + if vec_type == VectorizerType.TFIDF: + vectorizer = TfidfVectorizer(lucene_index_path, min_df=5) + elif vec_type == VectorizerType.SPECTER: + base += '.specter' + qrels_path += '.specter' + vectorizer = SpecterVectorizer() + elif vec_type == VectorizerType.BM25: + vectorizer = BM25Vectorizer(lucene_index_path, min_df=5) + else: + print('invalid vectorizer') + exit() + + f = open(output_path, 'w+') + + skipped_topics = set() + topics = get_topics_from_qrun(base) + for topic in topics: + train_docs, train_labels = get_X_Y_from_qrels_by_topic(qrels_path, topic, R) + if len(train_docs) == 0: + print(f'[topic][{topic}] skipped') + skipped_topics.add(topic) + continue + + print(f'[topic][{topic}] eligible train docs {len(train_docs)}') + + clf = None + if clf_type == ClassifierType.NB: + clf = MultinomialNB() + elif clf_type == ClassifierType.LR: + clf = LogisticRegression() + elif clf_type == ClassifierType.SVM: + clf = SVC(kernel='linear', probability=True) + else: + print('ClassifierType not supported') + exit() + + train_vectors = vectorizer.get_vectors(train_docs) + clf.fit(train_vectors, train_labels) + + test_docs, base_scores = get_docs_from_qrun_by_topic(base, topic) + print(f'[topic][{topic}] eligible test docs {len(test_docs)}') + test_vectors = vectorizer.get_vectors(test_docs) + + rank_scores = clf.predict_proba(test_vectors) + rank_scores = [row[1] for row in rank_scores] + + rank_scores = normalize(rank_scores) + base_scores = normalize(base_scores) + + preds = [a * alpha + b * (1-alpha) for a, b in zip(rank_scores, base_scores)] + preds, docs = sort_dual_list(preds, test_docs) + + for index, (score, doc_id) in enumerate(zip(preds, docs)): + rank = index + 1 + f.write(f'{topic} Q0 {doc_id} {rank} {score} {tag}\n') + + for topic in sort_str_topics_list(list(skipped_topics)): + lines = get_lines_by_topic(base, topic, tag) + print(f'Copying over skipped topic {topic} with {len(lines)} lines') + for line in lines: + f.write(f'{line}\n') + + f.close() + map_score,ndcg_score = evaluate(new_qrels, output_path) + with open(score_path, 'w') as outfile: + json.dump({'map':map_score,'ndcg':ndcg_score}, outfile) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='use tfidf vectorizer on cord-19 dataset with ccrf technique') + parser.add_argument('-tag', type=str, default="interpolation", + metavar="tag_name", help='tag name for resulting Qrun') + parser.add_argument('-new_qrels', type=str, default="data/qrels-rnd1+2+3+4.txt", + metavar="path_to_new_qrels", help='path to new_qrels file') + parser.add_argument('-base', type=str, default="data/covidex.t5.final.txt", + metavar="path_to_base_run", help='path to base run') + parser.add_argument('-tmp_base', type=str, default="tmp101}", + metavar="tmp file folder name", help='"tmp file folder name') + parser.add_argument('-qrels', type=str, default="data/qrels-rnd1+2.txt", + metavar="path_to_qrels", help='path to qrels file') + parser.add_argument('-index', type=str, default="data/lucene-index-cord19-abstract-2020-05-19", + metavar="path_to_lucene_index", help='path to lucene index folder') + parser.add_argument('-output', type=str, default="data/output.json", + metavar="path_to_base_run", help='the path to map and ndcg scores') + parser.add_argument('-alpha', type=float, required=True, help='alpha value for interpolation') + parser.add_argument('-clf', type=ClassifierType, required=True, help='which classifier to use') + parser.add_argument('-vectorizer', type=VectorizerType, required=True, help='which vectorizer to use') + args = parser.parse_args() + + R = [1, 2] + print('Using base run:', args.base) + rank(args.new_qrels, args.base, args.tmp_base, args.qrels, args.index, R, args.output, args.alpha, args.clf, args.vectorizer, args.tag) From 2de09428785fa03e143bd7b51ac948b30a0f0102 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Thu, 7 Jan 2021 23:47:35 -0500 Subject: [PATCH 12/20] update the evaluating qrel files --- integrations/test_trec_covid_r3.py | 29 +++--- integrations/test_trec_covid_r4.py | 96 +++++++++++++++++++ ...ovid-r3-ranker.py => trec-covid-ranker.py} | 0 3 files changed, 109 insertions(+), 16 deletions(-) create mode 100644 integrations/test_trec_covid_r4.py rename scripts/{trec-covid-r3-ranker.py => trec-covid-ranker.py} (100%) diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py index c8aa083b9..900316ed9 100644 --- a/integrations/test_trec_covid_r3.py +++ b/integrations/test_trec_covid_r3.py @@ -46,47 +46,44 @@ def setUp(self): os.mkdir(self.tmp) os.mkdir(f'{self.tmp}/runs') - self.round4_runs = { + self.round3_runs = { 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': 'dfccc32efd58a8284ae411e5c6b27ce9', 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': '7a5c27e8e052c49ff72d557051825973', } - download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', + download_url('https://storage.googleapis.com/neuralresearcher_data/trec_covid/data/53/covidex.t5.final.txt', f'{self.tmp}/runs') - with gzip.open(f'{self.tmp}/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: - with open(f'{self.tmp}/runs/covidex.r4.d2q.duot5', 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - for url in self.round4_runs: + for url in self.round3_runs: print(f'Verifying stored run at {url}...') filename = url.split('/')[-1] filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter - download_url(url, self.tmp, md5=self.round4_runs[url], force=True) + download_url(url, self.tmp, md5=self.round3_runs[url], force=True) self.assertTrue(os.path.exists(os.path.join(self.tmp, filename))) print('') def test_bm25(self): tmp_folder_name = self.tmp.split('/')[-1] - prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') - os.system(f'python {self.pyserini_root}/scripts/trec-covid-r3-ranker.py \ - -alpha 0.6 \ + prebuilt_index_path = download_prebuilt_index('trec-covid-r3-abstract') + os.system(f'python3 {self.pyserini_root}/scripts/trec-covid-ranker.py \ + -alpha 0.5 \ -clf lr \ -vectorizer tfidf \ - -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \ - -base {self.tmp}/runs/covidex.r4.d2q.duot5 \ + -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3.txt \ + -base {self.tmp}/runs/covidex.t5.final.txt \ -tmp_base {tmp_folder_name} \ - -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ + -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round2-cumulative.txt \ -index {prebuilt_index_path} \ - -tag {self.pyserini_root}/../trec-covid-r3/data/covidex.r4.d2q.duot5.lr \ + -tag covidex.r3.t5.lr \ -output {self.tmp}/output.json') with open(f'{self.tmp}/output.json') as json_file: data = json.load(json_file) - self.assertEqual("0.1764\\n'", data['map']) - self.assertEqual("0.7662\\n'", data['ndcg']) + self.assertEqual("0.3311\\n'", data['map']) + self.assertEqual("0.6866\\n'", data['ndcg']) def tearDown(self): shutil.rmtree(self.tmp) diff --git a/integrations/test_trec_covid_r4.py b/integrations/test_trec_covid_r4.py new file mode 100644 index 000000000..adce8968c --- /dev/null +++ b/integrations/test_trec_covid_r4.py @@ -0,0 +1,96 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import hashlib +import os +import re +import shutil +import unittest +import json +import gzip + +import sys +sys.path.append('..') +from random import randint +from integrations.simplesearcher_checker import SimpleSearcherChecker +from pyserini.util import download_url, download_prebuilt_index + + +class TestSearchIntegration(unittest.TestCase): + def setUp(self): + + curdir = os.getcwd() + if curdir.endswith('integrations'): + self.pyserini_root = '..' + else: + self.pyserini_root = '.' + + self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' + + # In the rare event there's a collision + if os.path.exists(self.tmp): + shutil.rmtree(self.tmp) + + os.mkdir(self.tmp) + os.mkdir(f'{self.tmp}/runs') + + self.round4_runs = { + 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': + 'dfccc32efd58a8284ae411e5c6b27ce9', + 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': + '7a5c27e8e052c49ff72d557051825973', + } + + download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', + f'{self.tmp}/runs') + + with gzip.open(f'{self.tmp}/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: + with open(f'{self.tmp}/runs/covidex.r4.d2q.duot5', 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + for url in self.round4_runs: + print(f'Verifying stored run at {url}...') + filename = url.split('/')[-1] + filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter + + download_url(url, self.tmp, md5=self.round4_runs[url], force=True) + self.assertTrue(os.path.exists(os.path.join(self.tmp, filename))) + print('') + + def test_bm25(self): + tmp_folder_name = self.tmp.split('/')[-1] + prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') + os.system(f'python3 {self.pyserini_root}/scripts/trec-covid-ranker.py \ + -alpha 0.6 \ + -clf lr \ + -vectorizer tfidf \ + -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4.txt \ + -base {self.tmp}/runs/covidex.r4.d2q.duot5 \ + -tmp_base {tmp_folder_name} \ + -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ + -index {prebuilt_index_path} \ + -tag covidex.r4.d2q.duot5.lr \ + -output {self.tmp}/output.json') + with open(f'{self.tmp}/output.json') as json_file: + data = json.load(json_file) + self.assertEqual("0.3846\\n'", data['map']) + self.assertEqual("0.7745\\n'", data['ndcg']) + + def tearDown(self): + shutil.rmtree(self.tmp) + + +if __name__ == '__main__': + unittest.main() diff --git a/scripts/trec-covid-r3-ranker.py b/scripts/trec-covid-ranker.py similarity index 100% rename from scripts/trec-covid-r3-ranker.py rename to scripts/trec-covid-ranker.py From bb3232eee769961981e9536af416c827a9ad267f Mon Sep 17 00:00:00 2001 From: Yuqi Date: Fri, 8 Jan 2021 00:51:23 -0500 Subject: [PATCH 13/20] fix a typo --- integrations/test_trec_covid_r3.py | 2 +- integrations/test_trec_covid_r4.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py index 900316ed9..84aa0c102 100644 --- a/integrations/test_trec_covid_r3.py +++ b/integrations/test_trec_covid_r3.py @@ -69,7 +69,7 @@ def setUp(self): def test_bm25(self): tmp_folder_name = self.tmp.split('/')[-1] prebuilt_index_path = download_prebuilt_index('trec-covid-r3-abstract') - os.system(f'python3 {self.pyserini_root}/scripts/trec-covid-ranker.py \ + os.system(f'python {self.pyserini_root}/scripts/trec-covid-ranker.py \ -alpha 0.5 \ -clf lr \ -vectorizer tfidf \ diff --git a/integrations/test_trec_covid_r4.py b/integrations/test_trec_covid_r4.py index adce8968c..d22604870 100644 --- a/integrations/test_trec_covid_r4.py +++ b/integrations/test_trec_covid_r4.py @@ -72,7 +72,7 @@ def setUp(self): def test_bm25(self): tmp_folder_name = self.tmp.split('/')[-1] prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') - os.system(f'python3 {self.pyserini_root}/scripts/trec-covid-ranker.py \ + os.system(f'python {self.pyserini_root}/scripts/trec-covid-ranker.py \ -alpha 0.6 \ -clf lr \ -vectorizer tfidf \ From a9f43466be522d4045e492f237dc3ba3546e1a5e Mon Sep 17 00:00:00 2001 From: Yuqi Date: Sun, 18 Apr 2021 12:57:00 -0400 Subject: [PATCH 14/20] add row1-3 unit tests for prf results table --- integrations/test_simplesearcher_check_prf.py | 109 +++++++ integrations/test_trec_covid_r3.py | 93 ------ integrations/test_trec_covid_r4.py | 96 ------ scripts/classifier_prf/cross_validate.py | 4 +- scripts/trec-covid-ranker.py | 298 ------------------ 5 files changed, 111 insertions(+), 489 deletions(-) create mode 100644 integrations/test_simplesearcher_check_prf.py delete mode 100644 integrations/test_trec_covid_r3.py delete mode 100644 integrations/test_trec_covid_r4.py delete mode 100644 scripts/trec-covid-ranker.py diff --git a/integrations/test_simplesearcher_check_prf.py b/integrations/test_simplesearcher_check_prf.py new file mode 100644 index 000000000..054741ca1 --- /dev/null +++ b/integrations/test_simplesearcher_check_prf.py @@ -0,0 +1,109 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import gzip +import os +import json +import shutil +import unittest +from random import randint +import tarfile +from pyserini.util import download_url +from integrations.utils import run_command, parse_score +from integrations.simplesearcher_checker import SimpleSearcherChecker + + +class TestSearchIntegration(unittest.TestCase): + def setUp(self): + curdir = os.getcwd() + if curdir.endswith('integrations'): + self.pyserini_root = '..' + self.anserini_root = '../../anserini' + else: + self.pyserini_root = '.' + self.anserini_root = '../anserini' + + self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' + + self.checker = SimpleSearcherChecker( + anserini_root=self.anserini_root, + index=os.path.join(self.anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw'), + topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'), + pyserini_topics='core18', + qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt')) + + if os.path.exists(self.tmp): + shutil.rmtree(self.tmp) + else: + os.mkdir(self.tmp) + + download_url('https://www.dropbox.com/s/6b81d5na2iuyvnc/core18.tar.gz?dl=1', f'{self.pyserini_root}/integrations') + + if os.path.exists(f'{self.tmp}/core18') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/core18.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + + def test_core18(self): + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.anserini_root}/runs/run.core18.bm25.topics.core18.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2495, delta=0.0001) + + def test_core18_lr(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_lr.txt --classifier lr ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_lr.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2837, delta=0.0001) + + def test_core18_svm(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_svm.txt --classifier svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2786, delta=0.0001) + + def tearDown(self): + shutil.rmtree(f'{self.tmp}') + + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/integrations/test_trec_covid_r3.py b/integrations/test_trec_covid_r3.py deleted file mode 100644 index 84aa0c102..000000000 --- a/integrations/test_trec_covid_r3.py +++ /dev/null @@ -1,93 +0,0 @@ -# -# Pyserini: Python interface to the Anserini IR toolkit built on Lucene -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import hashlib -import os -import re -import shutil -import unittest -import json -import gzip - -import sys -sys.path.append('..') -from random import randint -from integrations.simplesearcher_checker import SimpleSearcherChecker -from pyserini.util import download_url, download_prebuilt_index - - -class TestSearchIntegration(unittest.TestCase): - def setUp(self): - - curdir = os.getcwd() - if curdir.endswith('integrations'): - self.pyserini_root = '..' - else: - self.pyserini_root = '.' - - self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' - - # In the rare event there's a collision - if os.path.exists(self.tmp): - shutil.rmtree(self.tmp) - - os.mkdir(self.tmp) - os.mkdir(f'{self.tmp}/runs') - - self.round3_runs = { - 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': - 'dfccc32efd58a8284ae411e5c6b27ce9', - 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': - '7a5c27e8e052c49ff72d557051825973', - } - - download_url('https://storage.googleapis.com/neuralresearcher_data/trec_covid/data/53/covidex.t5.final.txt', - f'{self.tmp}/runs') - - - for url in self.round3_runs: - print(f'Verifying stored run at {url}...') - filename = url.split('/')[-1] - filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter - - download_url(url, self.tmp, md5=self.round3_runs[url], force=True) - self.assertTrue(os.path.exists(os.path.join(self.tmp, filename))) - print('') - - def test_bm25(self): - tmp_folder_name = self.tmp.split('/')[-1] - prebuilt_index_path = download_prebuilt_index('trec-covid-r3-abstract') - os.system(f'python {self.pyserini_root}/scripts/trec-covid-ranker.py \ - -alpha 0.5 \ - -clf lr \ - -vectorizer tfidf \ - -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3.txt \ - -base {self.tmp}/runs/covidex.t5.final.txt \ - -tmp_base {tmp_folder_name} \ - -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round2-cumulative.txt \ - -index {prebuilt_index_path} \ - -tag covidex.r3.t5.lr \ - -output {self.tmp}/output.json') - with open(f'{self.tmp}/output.json') as json_file: - data = json.load(json_file) - self.assertEqual("0.3311\\n'", data['map']) - self.assertEqual("0.6866\\n'", data['ndcg']) - - def tearDown(self): - shutil.rmtree(self.tmp) - - -if __name__ == '__main__': - unittest.main() diff --git a/integrations/test_trec_covid_r4.py b/integrations/test_trec_covid_r4.py deleted file mode 100644 index d22604870..000000000 --- a/integrations/test_trec_covid_r4.py +++ /dev/null @@ -1,96 +0,0 @@ -# -# Pyserini: Python interface to the Anserini IR toolkit built on Lucene -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import hashlib -import os -import re -import shutil -import unittest -import json -import gzip - -import sys -sys.path.append('..') -from random import randint -from integrations.simplesearcher_checker import SimpleSearcherChecker -from pyserini.util import download_url, download_prebuilt_index - - -class TestSearchIntegration(unittest.TestCase): - def setUp(self): - - curdir = os.getcwd() - if curdir.endswith('integrations'): - self.pyserini_root = '..' - else: - self.pyserini_root = '.' - - self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' - - # In the rare event there's a collision - if os.path.exists(self.tmp): - shutil.rmtree(self.tmp) - - os.mkdir(self.tmp) - os.mkdir(f'{self.tmp}/runs') - - self.round4_runs = { - 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt': - 'dfccc32efd58a8284ae411e5c6b27ce9', - 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt': - '7a5c27e8e052c49ff72d557051825973', - } - - download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz', - f'{self.tmp}/runs') - - with gzip.open(f'{self.tmp}/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in: - with open(f'{self.tmp}/runs/covidex.r4.d2q.duot5', 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - - for url in self.round4_runs: - print(f'Verifying stored run at {url}...') - filename = url.split('/')[-1] - filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter - - download_url(url, self.tmp, md5=self.round4_runs[url], force=True) - self.assertTrue(os.path.exists(os.path.join(self.tmp, filename))) - print('') - - def test_bm25(self): - tmp_folder_name = self.tmp.split('/')[-1] - prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract') - os.system(f'python {self.pyserini_root}/scripts/trec-covid-ranker.py \ - -alpha 0.6 \ - -clf lr \ - -vectorizer tfidf \ - -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4.txt \ - -base {self.tmp}/runs/covidex.r4.d2q.duot5 \ - -tmp_base {tmp_folder_name} \ - -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \ - -index {prebuilt_index_path} \ - -tag covidex.r4.d2q.duot5.lr \ - -output {self.tmp}/output.json') - with open(f'{self.tmp}/output.json') as json_file: - data = json.load(json_file) - self.assertEqual("0.3846\\n'", data['map']) - self.assertEqual("0.7745\\n'", data['ndcg']) - - def tearDown(self): - shutil.rmtree(self.tmp) - - -if __name__ == '__main__': - unittest.main() diff --git a/scripts/classifier_prf/cross_validate.py b/scripts/classifier_prf/cross_validate.py index a29b5368f..dbc44b530 100644 --- a/scripts/classifier_prf/cross_validate.py +++ b/scripts/classifier_prf/cross_validate.py @@ -28,12 +28,12 @@ def get_file_path(run_file, collection, classifier, alpha: str, rm3: bool): def get_res_file_path(run_file, collection, classifier, alpha: str, rm3: bool): - res = f'{run_file}/scripts/classifier_prf/cv/{collection}/scores_{collection}_{classifier}_A' + alpha + res = f'{run_file}/scripts/classifier_prf/cv/{collection}/{collection}_{classifier}_A' + alpha return res + get_file_extension(rm3) def get_trec_eval_cmd(anserini_root: str): - return os.path.join(anserini_root, 'eval/trec_eval.9.0.4/trec_eval') + return os.path.join(anserini_root, 'tools/eval/trec_eval.9.0.4/trec_eval') def get_qrels_path(anserini_root: str, collection: str): diff --git a/scripts/trec-covid-ranker.py b/scripts/trec-covid-ranker.py deleted file mode 100644 index a7d3c7208..000000000 --- a/scripts/trec-covid-ranker.py +++ /dev/null @@ -1,298 +0,0 @@ -import argparse -import os -import json -import sys -sys.path.append('..') -sys.path.append('../pyserini') -import subprocess - -from enum import Enum -from pyserini.vectorizer import TfidfVectorizer -from pyserini.vectorizer import BM25Vectorizer -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import LogisticRegression -from sklearn.svm import SVC -from typing import List -from sklearn import preprocessing -from typing import List, Set - -def normalize(scores): - low = min(scores) - high = max(scores) - width = high - low - - return [(s-low)/width for s in scores] - - -def sort_dual_list(pred, docs): - zipped_lists = zip(pred, docs) - sorted_pairs = sorted(zipped_lists) - - tuples = zip(*sorted_pairs) - pred, docs = [list(tuple) for tuple in tuples] - - pred.reverse() - docs.reverse() - return pred, docs - - -def sort_str_topics_list(topics: List[str]) -> List[str]: - res = sorted([int(t) for t in topics]) - return [str(t) for t in res] - - -def get_topics_from_qrun(path: str) -> Set[str]: - res = set() - with open(path, 'r') as f: - for line in f: - res.add(line.split()[0]) - return sort_str_topics_list(res) - - -def get_lines_by_topic(path, topic, tag): - res = [] - with open(path, 'r') as f: - for line in f: - tokens = line.split() - if tokens[0] != topic: - continue - tokens[-1] = tag - new_line = ' '.join(tokens) - res.append(new_line) - - return res - - -def read_qrels(path: str): - qrels = [] - - with open(path, 'r') as f: - for line in f: - line = line.strip() - tokens = line.split() - topic = tokens[0] - doc_id = tokens[-2] - relevance = int(tokens[-1]) - qrels.append({ - 'topic': topic, - 'doc_id': doc_id, - 'relevance': relevance - }) - - return qrels - - -def get_doc_to_id_from_qrun_by_topic(path: str, topic: str): - res = {} - with open(path, 'r') as f: - for line in f: - tokens = line.strip().split() - t = tokens[0] - if topic != t: - continue - doc_id = tokens[2] - score = float(tokens[-2]) - res[doc_id] = score - - return res - - -def get_docs_from_qrun_by_topic(path: str, topic: str): - x, y = [], [] - with open(path, 'r') as f: - for line in f: - tokens = line.strip().split() - t = tokens[0] - if topic != t: - continue - doc_id = tokens[2] - score = float(tokens[-2]) - x.append(doc_id) - y.append(score) - - return x, y - - -def get_X_Y_from_qrels_by_topic(path: str, topic: str, R: List[int]): - # always include topic 0 - R.append(0) - qrels = [qrel for qrel in read_qrels(path) if qrel['topic'] == topic and qrel['relevance'] in R] - x, y = [], [] - for pack in qrels: - x.append(pack['doc_id']) - label = 0 if pack['relevance'] == 0 else 1 - y.append(label) - - return x, y - - -class SpecterVectorizer: - def __init__(self): - path = "data/specter.csv" - self.vectors = {} - - with open(path, 'r') as f: - for line in f: - tokens = line.strip().split(',') - doc_id = tokens[0] - vector = [float(item) for item in tokens[1:]] - self.vectors[doc_id] = vector - - def get_vectors(self, doc_ids: List[str]): - res = [] - - for doc_id in doc_ids: - if doc_id in self.vectors: - res.append(self.vectors[doc_id]) - else: - print(f'{doc_id} not found') - - return preprocessing.normalize(res) - - -class ClassifierType(Enum): - SVM = 'svm' - LR = 'lr' - NB = 'nb' - - -ClassifierStr = { - ClassifierType.SVM: 'svm', - ClassifierType.LR: 'lr', - ClassifierType.NB: 'nb', -} - - -class VectorizerType(Enum): - TFIDF = 'tfidf' - BM25 = 'bm25' - SPECTER = 'specter' - - -VectorizerStr = { - VectorizerType.TFIDF: 'tfidf', - VectorizerType.BM25: 'bm25', - VectorizerType.SPECTER: 'specter', -} - - -def evaluate(qrels_path: str, run_path: str, options: str = ''): - curdir = os.getcwd() - if curdir.endswith('integrations'): - anserini_root = '../../anserini' - else: - anserini_root = '../anserini' - prefix = f"{anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -c -M1000 -m all_trec {qrels_path}" - cmd1 = f"{prefix} {run_path} {options} | grep 'ndcg_cut_20 '" - cmd2 = f"{prefix} {run_path} {options} | grep 'map '" - ndcg_score = str(subprocess.check_output(cmd1, shell=True)).split('\\t')[-1] - map_score = str(subprocess.check_output(cmd2, shell=True)).split('\\t')[-1] - return str(map_score),str(ndcg_score) - - -def rank(new_qrels: str, base: str,tmp_base:str, qrels_path: str, lucene_index_path: str, R: List[int], score_path: str, alpha: float, clf_type: ClassifierType, vec_type: VectorizerType, tag: str): - # build output path - base_str = base.split('/')[-1] - R_str = ''.join([str(i) for i in R]) - curdir = os.getcwd() - if curdir.endswith('integrations'): - output_path = f'{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt' - else: - output_path = f'integrations/{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt' - print(f'Output -> {output_path}') - os.system('mkdir -p runs') - - vectorizer = None - if vec_type == VectorizerType.TFIDF: - vectorizer = TfidfVectorizer(lucene_index_path, min_df=5) - elif vec_type == VectorizerType.SPECTER: - base += '.specter' - qrels_path += '.specter' - vectorizer = SpecterVectorizer() - elif vec_type == VectorizerType.BM25: - vectorizer = BM25Vectorizer(lucene_index_path, min_df=5) - else: - print('invalid vectorizer') - exit() - - f = open(output_path, 'w+') - - skipped_topics = set() - topics = get_topics_from_qrun(base) - for topic in topics: - train_docs, train_labels = get_X_Y_from_qrels_by_topic(qrels_path, topic, R) - if len(train_docs) == 0: - print(f'[topic][{topic}] skipped') - skipped_topics.add(topic) - continue - - print(f'[topic][{topic}] eligible train docs {len(train_docs)}') - - clf = None - if clf_type == ClassifierType.NB: - clf = MultinomialNB() - elif clf_type == ClassifierType.LR: - clf = LogisticRegression() - elif clf_type == ClassifierType.SVM: - clf = SVC(kernel='linear', probability=True) - else: - print('ClassifierType not supported') - exit() - - train_vectors = vectorizer.get_vectors(train_docs) - clf.fit(train_vectors, train_labels) - - test_docs, base_scores = get_docs_from_qrun_by_topic(base, topic) - print(f'[topic][{topic}] eligible test docs {len(test_docs)}') - test_vectors = vectorizer.get_vectors(test_docs) - - rank_scores = clf.predict_proba(test_vectors) - rank_scores = [row[1] for row in rank_scores] - - rank_scores = normalize(rank_scores) - base_scores = normalize(base_scores) - - preds = [a * alpha + b * (1-alpha) for a, b in zip(rank_scores, base_scores)] - preds, docs = sort_dual_list(preds, test_docs) - - for index, (score, doc_id) in enumerate(zip(preds, docs)): - rank = index + 1 - f.write(f'{topic} Q0 {doc_id} {rank} {score} {tag}\n') - - for topic in sort_str_topics_list(list(skipped_topics)): - lines = get_lines_by_topic(base, topic, tag) - print(f'Copying over skipped topic {topic} with {len(lines)} lines') - for line in lines: - f.write(f'{line}\n') - - f.close() - map_score,ndcg_score = evaluate(new_qrels, output_path) - with open(score_path, 'w') as outfile: - json.dump({'map':map_score,'ndcg':ndcg_score}, outfile) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use tfidf vectorizer on cord-19 dataset with ccrf technique') - parser.add_argument('-tag', type=str, default="interpolation", - metavar="tag_name", help='tag name for resulting Qrun') - parser.add_argument('-new_qrels', type=str, default="data/qrels-rnd1+2+3+4.txt", - metavar="path_to_new_qrels", help='path to new_qrels file') - parser.add_argument('-base', type=str, default="data/covidex.t5.final.txt", - metavar="path_to_base_run", help='path to base run') - parser.add_argument('-tmp_base', type=str, default="tmp101}", - metavar="tmp file folder name", help='"tmp file folder name') - parser.add_argument('-qrels', type=str, default="data/qrels-rnd1+2.txt", - metavar="path_to_qrels", help='path to qrels file') - parser.add_argument('-index', type=str, default="data/lucene-index-cord19-abstract-2020-05-19", - metavar="path_to_lucene_index", help='path to lucene index folder') - parser.add_argument('-output', type=str, default="data/output.json", - metavar="path_to_base_run", help='the path to map and ndcg scores') - parser.add_argument('-alpha', type=float, required=True, help='alpha value for interpolation') - parser.add_argument('-clf', type=ClassifierType, required=True, help='which classifier to use') - parser.add_argument('-vectorizer', type=VectorizerType, required=True, help='which vectorizer to use') - args = parser.parse_args() - - R = [1, 2] - print('Using base run:', args.base) - rank(args.new_qrels, args.base, args.tmp_base, args.qrels, args.index, R, args.output, args.alpha, args.clf, args.vectorizer, args.tag) From 1d61a3161f7a2ad09c190b56d58058cc8e949453 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Sun, 18 Apr 2021 13:00:15 -0400 Subject: [PATCH 15/20] allow user to change different python version --- integrations/test_simplesearcher_check_prf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/test_simplesearcher_check_prf.py b/integrations/test_simplesearcher_check_prf.py index 054741ca1..09060edda 100644 --- a/integrations/test_simplesearcher_check_prf.py +++ b/integrations/test_simplesearcher_check_prf.py @@ -69,7 +69,7 @@ def test_core18(self): self.assertAlmostEqual(score, 0.2495, delta=0.0001) def test_core18_lr(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_lr.txt --classifier lr ') @@ -85,7 +85,7 @@ def test_core18_lr(self): def test_core18_svm(self): os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_svm.txt --classifier svm ') From c4a1fb46bb67af3bfcfd6ee6f287cdd761e76fc5 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Wed, 21 Apr 2021 00:02:19 -0400 Subject: [PATCH 16/20] add unit tests for core17, robust04, robust05 for prf --- integrations/simplesearcher_checker.py | 2 +- integrations/simplesearcher_score_checker.py | 64 ++ .../test_simplesearcher_check_core17.py | 4 +- .../test_simplesearcher_check_core18.py | 4 +- integrations/test_simplesearcher_check_prf.py | 578 +++++++++++++++++- .../test_simplesearcher_check_robust04.py | 4 +- .../test_simplesearcher_check_robust05.py | 4 +- 7 files changed, 638 insertions(+), 22 deletions(-) create mode 100644 integrations/simplesearcher_score_checker.py diff --git a/integrations/simplesearcher_checker.py b/integrations/simplesearcher_checker.py index 889bd2a3d..5b591177d 100644 --- a/integrations/simplesearcher_checker.py +++ b/integrations/simplesearcher_checker.py @@ -19,7 +19,7 @@ from typing import List -class SimpleSearcherChecker: +class SimpleSercherAnseriniMatchChecker: def __init__(self, anserini_root: str, index: str, topics: str, pyserini_topics: str, qrels: str): self.anserini_root = anserini_root self.index_path = index diff --git a/integrations/simplesearcher_score_checker.py b/integrations/simplesearcher_score_checker.py new file mode 100644 index 000000000..8c20a5039 --- /dev/null +++ b/integrations/simplesearcher_score_checker.py @@ -0,0 +1,64 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import filecmp +import os +from typing import List +from integrations.utils import run_command, parse_score + + +class SimpleSearcherScoreChecker: + def __init__(self, index: str, topics: str, pyserini_topics: str, qrels: str, eval:str): + self.index_path = index + self.topics = topics + self.qrels = qrels + self.pyserini_topics = pyserini_topics + + self.pyserini_base_cmd = 'python3 -m pyserini.search' + + self.eval_base_cmd = eval + + @staticmethod + def _cleanup(files: List[str]): + for file in files: + if os.path.exists(file): + os.remove(file) + + def run(self, runtag: str, pyserini_extras: str, actualscore: float): + print('-------------------------') + print(f'Running {runtag}:') + print('-------------------------') + + pyserini_output = f'verify.pyserini.{runtag}.txt' + + pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \ + + f'--topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}' + + status = os.system(pyserini_cmd) + if not status == 0: + return False + + eval_cmd = f'{self.eval_base_cmd} {self.qrels} {pyserini_output}' + status = os.system(eval_cmd) + if not status == 0: + return False + stdout, stderr = run_command(eval_cmd) + score = parse_score(stdout, "map") + if actualscore !=score: + return False + return True + + diff --git a/integrations/test_simplesearcher_check_core17.py b/integrations/test_simplesearcher_check_core17.py index 5e7b71401..fc799a33a 100644 --- a/integrations/test_simplesearcher_check_core17.py +++ b/integrations/test_simplesearcher_check_core17.py @@ -17,7 +17,7 @@ import os import unittest -from integrations.simplesearcher_checker import SimpleSearcherChecker +from integrations.simplesearcher_checker import SimpleSercherAnseriniMatchChecker class TestSearchIntegration(unittest.TestCase): @@ -31,7 +31,7 @@ def setUp(self): anserini_root = '../anserini' pyserini_root = '.' - self.checker = SimpleSearcherChecker( + self.checker = SimpleSercherAnseriniMatchChecker( anserini_root=anserini_root, index=os.path.join(anserini_root, 'indexes/lucene-index.core17.pos+docvectors+raw'), topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.core17.txt'), diff --git a/integrations/test_simplesearcher_check_core18.py b/integrations/test_simplesearcher_check_core18.py index 911a4ee60..d35536563 100644 --- a/integrations/test_simplesearcher_check_core18.py +++ b/integrations/test_simplesearcher_check_core18.py @@ -17,7 +17,7 @@ import os import unittest -from integrations.simplesearcher_checker import SimpleSearcherChecker +from integrations.simplesearcher_checker import SimpleSercherAnseriniMatchChecker class TestSearchIntegration(unittest.TestCase): @@ -31,7 +31,7 @@ def setUp(self): anserini_root = '../anserini' pyserini_root = '.' - self.checker = SimpleSearcherChecker( + self.checker = SimpleSercherAnseriniMatchChecker( anserini_root=anserini_root, index=os.path.join(anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw'), topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'), diff --git a/integrations/test_simplesearcher_check_prf.py b/integrations/test_simplesearcher_check_prf.py index 09060edda..6a58f8c4d 100644 --- a/integrations/test_simplesearcher_check_prf.py +++ b/integrations/test_simplesearcher_check_prf.py @@ -23,7 +23,7 @@ import tarfile from pyserini.util import download_url from integrations.utils import run_command, parse_score -from integrations.simplesearcher_checker import SimpleSearcherChecker +from integrations.simplesearcher_score_checker import SimpleSearcherScoreChecker class TestSearchIntegration(unittest.TestCase): @@ -38,38 +38,219 @@ def setUp(self): self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' - self.checker = SimpleSearcherChecker( - anserini_root=self.anserini_root, + self.checker18 = SimpleSearcherScoreChecker( index=os.path.join(self.anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw'), topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'), pyserini_topics='core18', - qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt')) + qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'), + eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') if os.path.exists(self.tmp): shutil.rmtree(self.tmp) else: os.mkdir(self.tmp) - download_url('https://www.dropbox.com/s/6b81d5na2iuyvnc/core18.tar.gz?dl=1', f'{self.pyserini_root}/integrations') + download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AADdx7gdes6_pPAQKDPjgWDSa/core18.tar.gz?dl=1', + f'{self.pyserini_root}/integrations') if os.path.exists(f'{self.tmp}/core18') == False: tar = tarfile.open(f"{self.pyserini_root}/integrations/core18.tar.gz", "r:gz") tar.extractall(path=f'{self.tmp}') tar.close() - def test_core18(self): + self.checker17 = SimpleSearcherScoreChecker( + index=os.path.join(self.anserini_root, 'indexes/lucene-index.core17.pos+docvectors+raw'), + topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core17.txt'), + pyserini_topics='core17', + qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt'), + eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') + + download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AACYJ5rABTNzn1c4m1WyPCwva/core17.tar.gz?dl=1', + f'{self.pyserini_root}/integrations') + + if os.path.exists(f'{self.tmp}/core17') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/core17.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + + self.checkerrobust04 = SimpleSearcherScoreChecker( + index=os.path.join(self.anserini_root, 'indexes/lucene-index.robust04.pos+docvectors+raw'), + topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.robust04.txt'), + pyserini_topics='robust04', + qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt'), + eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') + + download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AADqN7nCCvansUvlHhdKAl18a/robust04.tar.gz?dl=1', + f'{self.pyserini_root}/integrations') + + if os.path.exists(f'{self.tmp}/robust04') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/robust04.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + + self.checkerrobust05 = SimpleSearcherScoreChecker( + index=os.path.join(self.anserini_root, 'indexes/lucene-index.robust05.pos+docvectors+raw'), + topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.robust05.txt'), + pyserini_topics='robust05', + qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt'), + eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') + + download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AADtpKu2bx4Imp05Hm4HUtsYa/robust05.tar.gz?dl=1', + f'{self.pyserini_root}/integrations') + + if os.path.exists(f'{self.tmp}/robust05') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/robust05.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + + + # Core17 + + def test_core17(self): + self.assertTrue(self.checker17.run('core17_bm25', '--bm25', 0.2087)) + + def test_core17_rm3(self): + self.assertTrue(self.checker17.run('core17_bm25', '--bm25 --rm3',0.2823)) + + def test_core17_lr(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_lr.txt --classifier lr ') + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.anserini_root}/runs/run.core18.bm25.topics.core18.txt' + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_lr.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2462, delta=0.0001) + + def test_core17_lr_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_lr_rm3.txt --classifier lr -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_lr_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2906, delta=0.0001) + + def test_core17_svm(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_svm.txt --classifier svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2367, delta=0.0001) + + def test_core17_svm_rm3(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_svm_rm3.txt --classifier svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2940, delta=0.0001) + + def test_core17_avg(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_lr+svm.txt --classifier lr+svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_lr+svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2426, delta=0.0001) + + def test_core17_avg_rm3(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_lr+svm_rm3.txt --classifier lr+svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_lr+svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2952, delta=0.0001) + + def test_core17_rrf(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_rrf.txt --classifier rrf') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_rrf.txt' status = os.system(cmd1) stdout, stderr = run_command(cmd1) score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2495, delta=0.0001) + self.assertAlmostEqual(score, 0.2433, delta=0.0001) + + def test_core17_rrf_rm3(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_rrf_rm3.txt --classifier rrf -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_rrf_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2951, delta=0.0001) + + # Core18 + def test_core18(self): + self.assertTrue(self.checker18.run('core18_bm25', '--bm25',0.2495)) + + def test_core18_rm3(self): + self.assertTrue(self.checker18.run('core18_bm25', '--bm25 --rm3',0.3135)) def test_core18_lr(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_lr.txt --classifier lr ') @@ -83,9 +264,23 @@ def test_core18_lr(self): self.assertEqual(stderr, '') self.assertAlmostEqual(score, 0.2837, delta=0.0001) + def test_core18_lr_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_lr_rm3.txt --classifier lr -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_lr_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.3195, delta=0.0001) + def test_core18_svm(self): - os.system( - f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_svm.txt --classifier svm ') @@ -95,11 +290,368 @@ def test_core18_svm(self): status = os.system(cmd1) stdout, stderr = run_command(cmd1) score = parse_score(stdout, "map") - os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') self.assertEqual(status, 0) self.assertEqual(stderr, '') self.assertAlmostEqual(score, 0.2786, delta=0.0001) + def test_core18_svm_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_svm_rm3.txt --classifier svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.3220, delta=0.0001) + + def test_core18_avg(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_lr+svm.txt --classifier lr+svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_lr+svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2821, delta=0.0001) + + def test_core18_avg_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_lr+svm_rm3.txt --classifier lr+svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_lr+svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.3200, delta=0.0001) + + def test_core18_rrf(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_rrf.txt --classifier rrf') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_rrf.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2871, delta=0.0001) + + def test_core18_rrf_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core18 --output {self.tmp}/core18_rrf_rm3.txt --classifier rrf -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_rrf_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.3204, delta=0.0001) + + # robust04 + def test_robust04(self): + self.assertTrue(self.checkerrobust04.run('robust04_bm25', '--bm25',0.2531)) + + def test_robust04_rm3(self): + self.assertTrue(self.checkerrobust04.run('robust04_bm25_rm3', '--bm25 --rm3',0.2903)) + + def test_robust04_lr(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_lr.txt --classifier lr ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_lr.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2721, delta=0.0001) + + def test_robust04_lr_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_lr_rm3.txt --classifier lr -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_lr_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2961, delta=0.0001) + + def test_robust04_svm(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_svm.txt --classifier svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2716, delta=0.0001) + + def test_robust04_svm_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_svm_rm3.txt --classifier svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2963, delta=0.0001) + + def test_robust04_avg(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_lr+svm.txt --classifier lr+svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_lr+svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2745, delta=0.0001) + + def test_robust04_avg_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_lr+svm_rm3.txt --classifier lr+svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_lr+svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2980, delta=0.0001) + + def test_robust04_rrf(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_rrf.txt --classifier rrf') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_rrf.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2740, delta=0.0001) + + def test_robust04_rrf_rm3(self): + os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust04 --output {self.tmp}/robust04_rrf_rm3.txt --classifier rrf -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_rrf_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2975, delta=0.0001) + + # robust05 + + def test_robust05(self): + self.assertTrue(self.checkerrobust05.run('robust05_bm25', '--bm25', 0.2032)) + + def test_robust05_rm3(self): + self.assertTrue(self.checkerrobust05.run('robust05_bm25_rm3', '--bm25 --rm3', 0.2602)) + + def test_robust05_lr(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_lr.txt --classifier lr ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_lr.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2476, delta=0.0001) + + def test_robust05_lr_rm3(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_lr_rm3.txt --classifier lr -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_lr_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2845, delta=0.0001) + + def test_robust05_svm(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_svm.txt --classifier svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2454, delta=0.0001) + + def test_robust05_svm_rm3(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_svm_rm3.txt --classifier svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + os.remove(f'{self.pyserini_root}/integrations/core17.tar.gz') + os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + os.remove(f'{self.pyserini_root}/integrations/robust04.tar.gz') + os.remove(f'{self.pyserini_root}/integrations/robust05.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2850, delta=0.0001) + + def test_robust05_avg(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_lr+svm.txt --classifier lr+svm ') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_lr+svm.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2485, delta=0.0001) + + def test_robust05_avg_rm3(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_lr+svm_rm3.txt --classifier lr+svm -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_lr+svm_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2860, delta=0.0001) + + def test_robust05_rrf(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_rrf.txt --classifier rrf') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_rrf.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2490, delta=0.0001) + + def test_robust05_rrf_rm3(self): + os.system( + f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection robust05 --output {self.tmp}/robust05_rrf_rm3.txt --classifier rrf -rm3') + + cmd1 = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_rrf_rm3.txt' + status = os.system(cmd1) + stdout, stderr = run_command(cmd1) + score = parse_score(stdout, "map") + # os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2870, delta=0.0001) + + def tearDown(self): shutil.rmtree(f'{self.tmp}') diff --git a/integrations/test_simplesearcher_check_robust04.py b/integrations/test_simplesearcher_check_robust04.py index c828ad557..26fa5fcae 100644 --- a/integrations/test_simplesearcher_check_robust04.py +++ b/integrations/test_simplesearcher_check_robust04.py @@ -17,7 +17,7 @@ import os import unittest -from integrations.simplesearcher_checker import SimpleSearcherChecker +from integrations.simplesearcher_checker import SimpleSercherAnseriniMatchChecker class TestSearchIntegration(unittest.TestCase): @@ -31,7 +31,7 @@ def setUp(self): anserini_root = '../anserini' pyserini_root = '.' - self.checker = SimpleSearcherChecker( + self.checker = SimpleSercherAnseriniMatchChecker( anserini_root=anserini_root, index=os.path.join(anserini_root, 'indexes/lucene-index.robust04.pos+docvectors+raw'), topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.robust04.txt'), diff --git a/integrations/test_simplesearcher_check_robust05.py b/integrations/test_simplesearcher_check_robust05.py index f55c70e38..0a2870856 100644 --- a/integrations/test_simplesearcher_check_robust05.py +++ b/integrations/test_simplesearcher_check_robust05.py @@ -17,7 +17,7 @@ import os import unittest -from integrations.simplesearcher_checker import SimpleSearcherChecker +from integrations.simplesearcher_checker import SimpleSercherAnseriniMatchChecker class TestSearchIntegration(unittest.TestCase): @@ -31,7 +31,7 @@ def setUp(self): anserini_root = '../anserini' pyserini_root = '.' - self.checker = SimpleSearcherChecker( + self.checker = SimpleSercherAnseriniMatchChecker( anserini_root=anserini_root, index=os.path.join(anserini_root, 'indexes/lucene-index.robust05.pos+docvectors+raw'), topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.robust05.txt'), From d4895b4052de59d1c6a6b23b0bea5648b732772a Mon Sep 17 00:00:00 2001 From: Yuqi Date: Wed, 21 Apr 2021 22:29:29 -0400 Subject: [PATCH 17/20] add exception if the run file tar.gz is not downloaded properly --- integrations/test_simplesearcher_check_prf.py | 158 +++++++++--------- 1 file changed, 75 insertions(+), 83 deletions(-) diff --git a/integrations/test_simplesearcher_check_prf.py b/integrations/test_simplesearcher_check_prf.py index 6a58f8c4d..1d8ab82ab 100644 --- a/integrations/test_simplesearcher_check_prf.py +++ b/integrations/test_simplesearcher_check_prf.py @@ -1,5 +1,5 @@ # -# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# Pyserini: python interface to the Anserini IR toolkit built on Lucene # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -39,24 +39,27 @@ def setUp(self): self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}' self.checker18 = SimpleSearcherScoreChecker( - index=os.path.join(self.anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw'), - topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'), - pyserini_topics='core18', - qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'), - eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') + index=os.path.join(self.anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw'), + topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'), + pyserini_topics='core18', + qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'), + eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') + if os.path.exists(self.tmp): shutil.rmtree(self.tmp) else: os.mkdir(self.tmp) - download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AADdx7gdes6_pPAQKDPjgWDSa/core18.tar.gz?dl=1', - f'{self.pyserini_root}/integrations') - - if os.path.exists(f'{self.tmp}/core18') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/core18.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() + try: + if os.path.exists(f'{self.tmp}/core18') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/core18.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + except: + shutil.rmtree(f'{self.tmp}') + print(f'core18.tar.gz is not saved in {self.pyserini_root}/integrations') + raise self.checker17 = SimpleSearcherScoreChecker( index=os.path.join(self.anserini_root, 'indexes/lucene-index.core17.pos+docvectors+raw'), @@ -65,13 +68,15 @@ def setUp(self): qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt'), eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') - download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AACYJ5rABTNzn1c4m1WyPCwva/core17.tar.gz?dl=1', - f'{self.pyserini_root}/integrations') - - if os.path.exists(f'{self.tmp}/core17') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/core17.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() + try: + if os.path.exists(f'{self.tmp}/core17') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/core17.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + except: + shutil.rmtree(f'{self.tmp}') + print(f'core17.tar.gz is not saved in {self.pyserini_root}/integrations') + raise self.checkerrobust04 = SimpleSearcherScoreChecker( index=os.path.join(self.anserini_root, 'indexes/lucene-index.robust04.pos+docvectors+raw'), @@ -80,13 +85,15 @@ def setUp(self): qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt'), eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') - download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AADqN7nCCvansUvlHhdKAl18a/robust04.tar.gz?dl=1', - f'{self.pyserini_root}/integrations') - - if os.path.exists(f'{self.tmp}/robust04') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/robust04.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() + try: + if os.path.exists(f'{self.tmp}/robust04') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/robust04.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + except: + shutil.rmtree(f'{self.tmp}') + print(f'robust04.tar.gz is not saved in {self.pyserini_root}/integrations') + raise self.checkerrobust05 = SimpleSearcherScoreChecker( index=os.path.join(self.anserini_root, 'indexes/lucene-index.robust05.pos+docvectors+raw'), @@ -95,13 +102,15 @@ def setUp(self): qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt'), eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') - download_url('https://www.dropbox.com/sh/xo9v2glmwy3jrtn/AADtpKu2bx4Imp05Hm4HUtsYa/robust05.tar.gz?dl=1', - f'{self.pyserini_root}/integrations') - - if os.path.exists(f'{self.tmp}/robust05') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/robust05.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() + try: + if os.path.exists(f'{self.tmp}/robust05') == False: + tar = tarfile.open(f"{self.pyserini_root}/integrations/robust05.tar.gz", "r:gz") + tar.extractall(path=f'{self.tmp}') + tar.close() + except: + shutil.rmtree(f'{self.tmp}') + print(f'robust05.tar.gz is not saved in {self.pyserini_root}/integrations') + raise # Core17 @@ -113,7 +122,7 @@ def test_core17_rm3(self): self.assertTrue(self.checker17.run('core17_bm25', '--bm25 --rm3',0.2823)) def test_core17_lr(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_lr.txt --classifier lr ') @@ -128,7 +137,7 @@ def test_core17_lr(self): self.assertAlmostEqual(score, 0.2462, delta=0.0001) def test_core17_lr_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_lr_rm3.txt --classifier lr -rm3') @@ -143,8 +152,7 @@ def test_core17_lr_rm3(self): self.assertAlmostEqual(score, 0.2906, delta=0.0001) def test_core17_svm(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_svm.txt --classifier svm ') @@ -160,8 +168,7 @@ def test_core17_svm(self): self.assertAlmostEqual(score, 0.2367, delta=0.0001) def test_core17_svm_rm3(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_svm_rm3.txt --classifier svm -rm3') @@ -176,8 +183,7 @@ def test_core17_svm_rm3(self): self.assertAlmostEqual(score, 0.2940, delta=0.0001) def test_core17_avg(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_lr+svm.txt --classifier lr+svm ') @@ -193,8 +199,7 @@ def test_core17_avg(self): self.assertAlmostEqual(score, 0.2426, delta=0.0001) def test_core17_avg_rm3(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_lr+svm_rm3.txt --classifier lr+svm -rm3') @@ -210,7 +215,7 @@ def test_core17_avg_rm3(self): self.assertAlmostEqual(score, 0.2952, delta=0.0001) def test_core17_rrf(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_rrf.txt --classifier rrf') @@ -226,8 +231,7 @@ def test_core17_rrf(self): self.assertAlmostEqual(score, 0.2433, delta=0.0001) def test_core17_rrf_rm3(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core17 --output {self.tmp}/core17_rrf_rm3.txt --classifier rrf -rm3') @@ -250,7 +254,7 @@ def test_core18_rm3(self): self.assertTrue(self.checker18.run('core18_bm25', '--bm25 --rm3',0.3135)) def test_core18_lr(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_lr.txt --classifier lr ') @@ -265,7 +269,7 @@ def test_core18_lr(self): self.assertAlmostEqual(score, 0.2837, delta=0.0001) def test_core18_lr_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_lr_rm3.txt --classifier lr -rm3') @@ -280,7 +284,7 @@ def test_core18_lr_rm3(self): self.assertAlmostEqual(score, 0.3195, delta=0.0001) def test_core18_svm(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_svm.txt --classifier svm ') @@ -296,7 +300,7 @@ def test_core18_svm(self): self.assertAlmostEqual(score, 0.2786, delta=0.0001) def test_core18_svm_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_svm_rm3.txt --classifier svm -rm3') @@ -311,7 +315,7 @@ def test_core18_svm_rm3(self): self.assertAlmostEqual(score, 0.3220, delta=0.0001) def test_core18_avg(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_lr+svm.txt --classifier lr+svm ') @@ -327,7 +331,7 @@ def test_core18_avg(self): self.assertAlmostEqual(score, 0.2821, delta=0.0001) def test_core18_avg_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_lr+svm_rm3.txt --classifier lr+svm -rm3') @@ -343,7 +347,7 @@ def test_core18_avg_rm3(self): self.assertAlmostEqual(score, 0.3200, delta=0.0001) def test_core18_rrf(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_rrf.txt --classifier rrf') @@ -359,7 +363,7 @@ def test_core18_rrf(self): self.assertAlmostEqual(score, 0.2871, delta=0.0001) def test_core18_rrf_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection core18 --output {self.tmp}/core18_rrf_rm3.txt --classifier rrf -rm3') @@ -382,7 +386,7 @@ def test_robust04_rm3(self): self.assertTrue(self.checkerrobust04.run('robust04_bm25_rm3', '--bm25 --rm3',0.2903)) def test_robust04_lr(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_lr.txt --classifier lr ') @@ -397,7 +401,7 @@ def test_robust04_lr(self): self.assertAlmostEqual(score, 0.2721, delta=0.0001) def test_robust04_lr_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_lr_rm3.txt --classifier lr -rm3') @@ -412,7 +416,7 @@ def test_robust04_lr_rm3(self): self.assertAlmostEqual(score, 0.2961, delta=0.0001) def test_robust04_svm(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_svm.txt --classifier svm ') @@ -428,7 +432,7 @@ def test_robust04_svm(self): self.assertAlmostEqual(score, 0.2716, delta=0.0001) def test_robust04_svm_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_svm_rm3.txt --classifier svm -rm3') @@ -443,7 +447,7 @@ def test_robust04_svm_rm3(self): self.assertAlmostEqual(score, 0.2963, delta=0.0001) def test_robust04_avg(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_lr+svm.txt --classifier lr+svm ') @@ -459,7 +463,7 @@ def test_robust04_avg(self): self.assertAlmostEqual(score, 0.2745, delta=0.0001) def test_robust04_avg_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_lr+svm_rm3.txt --classifier lr+svm -rm3') @@ -475,7 +479,7 @@ def test_robust04_avg_rm3(self): self.assertAlmostEqual(score, 0.2980, delta=0.0001) def test_robust04_rrf(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_rrf.txt --classifier rrf') @@ -491,7 +495,7 @@ def test_robust04_rrf(self): self.assertAlmostEqual(score, 0.2740, delta=0.0001) def test_robust04_rrf_rm3(self): - os.system(f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust04 --output {self.tmp}/robust04_rrf_rm3.txt --classifier rrf -rm3') @@ -515,8 +519,7 @@ def test_robust05_rm3(self): self.assertTrue(self.checkerrobust05.run('robust05_bm25_rm3', '--bm25 --rm3', 0.2602)) def test_robust05_lr(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_lr.txt --classifier lr ') @@ -531,8 +534,7 @@ def test_robust05_lr(self): self.assertAlmostEqual(score, 0.2476, delta=0.0001) def test_robust05_lr_rm3(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_lr_rm3.txt --classifier lr -rm3') @@ -547,8 +549,7 @@ def test_robust05_lr_rm3(self): self.assertAlmostEqual(score, 0.2845, delta=0.0001) def test_robust05_svm(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_svm.txt --classifier svm ') @@ -564,8 +565,7 @@ def test_robust05_svm(self): self.assertAlmostEqual(score, 0.2454, delta=0.0001) def test_robust05_svm_rm3(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_svm_rm3.txt --classifier svm -rm3') @@ -575,17 +575,12 @@ def test_robust05_svm_rm3(self): status = os.system(cmd1) stdout, stderr = run_command(cmd1) score = parse_score(stdout, "map") - os.remove(f'{self.pyserini_root}/integrations/core17.tar.gz') - os.remove(f'{self.pyserini_root}/integrations/core18.tar.gz') - os.remove(f'{self.pyserini_root}/integrations/robust04.tar.gz') - os.remove(f'{self.pyserini_root}/integrations/robust05.tar.gz') self.assertEqual(status, 0) self.assertEqual(stderr, '') self.assertAlmostEqual(score, 0.2850, delta=0.0001) def test_robust05_avg(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_lr+svm.txt --classifier lr+svm ') @@ -601,8 +596,7 @@ def test_robust05_avg(self): self.assertAlmostEqual(score, 0.2485, delta=0.0001) def test_robust05_avg_rm3(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_lr+svm_rm3.txt --classifier lr+svm -rm3') @@ -618,8 +612,7 @@ def test_robust05_avg_rm3(self): self.assertAlmostEqual(score, 0.2860, delta=0.0001) def test_robust05_rrf(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_rrf.txt --classifier rrf') @@ -635,8 +628,7 @@ def test_robust05_rrf(self): self.assertAlmostEqual(score, 0.2490, delta=0.0001) def test_robust05_rrf_rm3(self): - os.system( - f'python3 {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py --anserini {self.anserini_root} \ --run_file {self.tmp} --pyserini {self.pyserini_root} \ --collection robust05 --output {self.tmp}/robust05_rrf_rm3.txt --classifier rrf -rm3') From 077a21b19262a9c3264ba8f9b80017cb1aa313ca Mon Sep 17 00:00:00 2001 From: Yuqi Date: Wed, 21 Apr 2021 22:32:20 -0400 Subject: [PATCH 18/20] update python3 to python --- integrations/simplesearcher_checker.py | 2 +- integrations/simplesearcher_score_checker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/simplesearcher_checker.py b/integrations/simplesearcher_checker.py index 5b591177d..9f6f27d30 100644 --- a/integrations/simplesearcher_checker.py +++ b/integrations/simplesearcher_checker.py @@ -29,7 +29,7 @@ def __init__(self, anserini_root: str, index: str, topics: str, pyserini_topics: self.anserini_base_cmd = os.path.join(self.anserini_root, 'target/appassembler/bin/SearchCollection -topicreader Trec') - self.pyserini_base_cmd = 'python3 -m pyserini.search' + self.pyserini_base_cmd = 'python -m pyserini.search' self.eval_base_cmd = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30' diff --git a/integrations/simplesearcher_score_checker.py b/integrations/simplesearcher_score_checker.py index 8c20a5039..6aed096c8 100644 --- a/integrations/simplesearcher_score_checker.py +++ b/integrations/simplesearcher_score_checker.py @@ -27,7 +27,7 @@ def __init__(self, index: str, topics: str, pyserini_topics: str, qrels: str, ev self.qrels = qrels self.pyserini_topics = pyserini_topics - self.pyserini_base_cmd = 'python3 -m pyserini.search' + self.pyserini_base_cmd = 'python -m pyserini.search' self.eval_base_cmd = eval From ed30f2a44d00a486b6ac5ad1471ccc402dd4af01 Mon Sep 17 00:00:00 2001 From: Yuqi Date: Tue, 27 Apr 2021 10:22:36 -0400 Subject: [PATCH 19/20] add prf test for cross validation, run file generation and score check --- integrations/test_simplesearcher_check_prf.py | 965 +++++++++++------- 1 file changed, 614 insertions(+), 351 deletions(-) diff --git a/integrations/test_simplesearcher_check_prf.py b/integrations/test_simplesearcher_check_prf.py index 21f19bc6b..1a4f32a4e 100644 --- a/integrations/test_simplesearcher_check_prf.py +++ b/integrations/test_simplesearcher_check_prf.py @@ -41,39 +41,32 @@ def setUp(self): else: os.mkdir(self.tmp) + self.pyserini_search_cmd = 'python -m pyserini.search' + self.core17_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.core17.pos+docvectors+raw') + self.core17_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt') + + self.core18_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw') + self.core18_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt') + + self.robust04_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.robust04.pos+docvectors+raw') + self.robust04_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt') + + self.robust05_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.robust05.pos+docvectors+raw') + self.robust05_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt') + self.core17_checker = SimpleSearcherScoreChecker( - index=os.path.join(self.anserini_root, 'indexes/lucene-index.core17.pos+docvectors+raw'), + index=self.core17_index_path, topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core17.txt'), pyserini_topics='core17', - qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt'), + qrels=self.core17_qrels_path, eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') - try: - if os.path.exists(f'{self.tmp}/core17') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/core17.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() - except: - shutil.rmtree(f'{self.tmp}') - print(f'core17.tar.gz is not saved in {self.pyserini_root}/integrations') - raise - self.core18_checker = SimpleSearcherScoreChecker( - index=os.path.join(self.anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw'), - topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'), - pyserini_topics='core18', - qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'), - eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') - - try: - if os.path.exists(f'{self.tmp}/core18') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/core18.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() - except: - shutil.rmtree(f'{self.tmp}') - print(f'core18.tar.gz is not saved in {self.pyserini_root}/integrations') - raise + index=os.path.join(self.anserini_root, 'indexes/lucene-index.core18.pos+docvectors+raw'), + topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'), + pyserini_topics='core18', + qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'), + eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') self.robust04_checker = SimpleSearcherScoreChecker( index=os.path.join(self.anserini_root, 'indexes/lucene-index.robust04.pos+docvectors+raw'), @@ -82,16 +75,6 @@ def setUp(self): qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt'), eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') - try: - if os.path.exists(f'{self.tmp}/robust04') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/robust04.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() - except: - shutil.rmtree(f'{self.tmp}') - print(f'robust04.tar.gz is not saved in {self.pyserini_root}/integrations') - raise - self.robust05_checker = SimpleSearcherScoreChecker( index=os.path.join(self.anserini_root, 'indexes/lucene-index.robust05.pos+docvectors+raw'), topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.robust05.txt'), @@ -99,15 +82,30 @@ def setUp(self): qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt'), eval=f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30') - try: - if os.path.exists(f'{self.tmp}/robust05') == False: - tar = tarfile.open(f"{self.pyserini_root}/integrations/robust05.tar.gz", "r:gz") - tar.extractall(path=f'{self.tmp}') - tar.close() - except: - shutil.rmtree(f'{self.tmp}') - print(f'robust05.tar.gz is not saved in {self.pyserini_root}/integrations') - raise + def test_cross_validation(self): + pyserini_topics = 'core17' + for alpha in [x / 10.0 for x in range(0, 11)]: + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17/core17_lr_A{alpha}_bm25.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha {alpha}' + + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ + --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ + --collection core17 --output {self.tmp}/core17_lr.txt --classifier lr ') + + cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_lr.txt' + + status = os.system(cmd) + stdout, stderr = run_command(cmd) + score = parse_score(stdout, "map") + + self.assertEqual(status, 0) + self.assertEqual(stderr, '') + self.assertAlmostEqual(score, 0.2462, delta=0.0001) def test_core17(self): self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25', 0.2087)) @@ -116,139 +114,206 @@ def test_core17_rm3(self): self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25 --rm3', 0.2823)) def test_core17_lr(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_lr.txt --classifier lr ') + pyserini_topics = 'core17' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt {self.tmp}/core17_lr.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7' - status = os.system(cmd) - stdout, stderr = run_command(cmd) + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_lr.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2462, delta=0.0001) + self.assertAlmostEqual(score, 0.2473, delta=0.0001) def test_core17_lr_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_lr_rm3.txt --classifier lr -rm3') + pyserini_topics = 'core17' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ - {self.tmp}/core17_lr_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_lr_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2906, delta=0.0001) + self.assertAlmostEqual(score, 0.2940, delta=0.0001) def test_core17_svm(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_svm.txt --classifier svm') + pyserini_topics = 'core17' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ - {self.tmp}/core17_svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7' - status = os.system(cmd) - stdout, stderr = run_command(cmd) + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_svm.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2367, delta=0.0001) + self.assertAlmostEqual(score, 0.2385, delta=0.0001) def test_core17_svm_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_svm_rm3.txt --classifier svm -rm3') + pyserini_topics = 'core17' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ - {self.tmp}/core17_svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_svm_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2940, delta=0.0001) + self.assertAlmostEqual(score, 0.2970, delta=0.0001) def test_core17_avg(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_lr+svm.txt --classifier lr+svm ') + pyserini_topics = 'core17' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ - {self.tmp}/core17_lr+svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_avg.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6' - status = os.system(cmd) - stdout, stderr = run_command(cmd) + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_avg.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2426, delta=0.0001) + self.assertAlmostEqual(score, 0.2442, delta=0.0001) def test_core17_avg_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_lr+svm_rm3.txt --classifier lr+svm -rm3') + pyserini_topics = 'core17' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ - {self.tmp}/core17_lr+svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_avg_rm3.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_avg_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2952, delta=0.0001) + self.assertAlmostEqual(score, 0.2967, delta=0.0001) def test_core17_rrf(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_rrf.txt --classifier rrf') + pyserini_topics = 'core17' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ - {self.tmp}/core17_rrf.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) + svm_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/core17_lr.txt {self.tmp}/core17_svm.txt ' \ + + f'--out {self.tmp}/core17_rrf.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_rrf.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2433, delta=0.0001) + self.assertAlmostEqual(score, 0.2446, delta=0.0001) def test_core17_rrf_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core17 --output {self.tmp}/core17_rrf_rm3.txt --classifier rrf -rm3') + pyserini_topics = 'core17' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ - {self.tmp}/core17_rrf_rm3.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) + svm_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core17_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/core17_lr_rm3.txt {self.tmp}/core17_svm_rm3.txt ' \ + + f'--out {self.tmp}/core17_rrf_rm3.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \ + {self.tmp}/core17_rrf_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2951, delta=0.0001) + self.assertAlmostEqual(score, 0.2965, delta=0.0001) def test_core18(self): self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25', 0.2495)) @@ -257,140 +322,206 @@ def test_core18_rm3(self): self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25 --rm3', 0.3135)) def test_core18_lr(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_lr.txt --classifier lr') + pyserini_topics = 'core18' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6' + + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ {self.tmp}/core18_lr.txt' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') self.assertAlmostEqual(score, 0.2837, delta=0.0001) def test_core18_lr_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_lr_rm3.txt --classifier lr -rm3') + pyserini_topics = 'core18' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.tmp}/core18_lr_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_lr_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.3195, delta=0.0001) + self.assertAlmostEqual(score, 0.3222, delta=0.0001) def test_core18_svm(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_svm.txt --classifier svm ') + pyserini_topics = 'core18' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.tmp}/core18_svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_svm.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2786, delta=0.0001) + self.assertAlmostEqual(score, 0.2840, delta=0.0001) def test_core18_svm_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_svm_rm3.txt --classifier svm -rm3') + pyserini_topics = 'core18' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.tmp}/core18_svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_svm_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.3220, delta=0.0001) + self.assertAlmostEqual(score, 0.3219, delta=0.0001) def test_core18_avg(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_lr+svm.txt --classifier lr+svm') + pyserini_topics = 'core18' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.tmp}/core18_lr+svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_avg.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_avg.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2821, delta=0.0001) + self.assertAlmostEqual(score, 0.2860, delta=0.0001) def test_core18_avg_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_lr+svm_rm3.txt --classifier lr+svm -rm3') + pyserini_topics = 'core18' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.tmp}/core18_lr+svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_avg_rm3.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_avg_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.3200, delta=0.0001) + self.assertAlmostEqual(score, 0.3227, delta=0.0001) def test_core18_rrf(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_rrf.txt --classifier rrf') + pyserini_topics = 'core18' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.tmp}/core18_rrf.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + svm_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/core18_lr.txt {self.tmp}/core18_svm.txt ' \ + + f'--out {self.tmp}/core18_rrf.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_rrf.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2871, delta=0.0001) + self.assertAlmostEqual(score, 0.2881, delta=0.0001) def test_core18_rrf_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection core18 --output {self.tmp}/core18_rrf_rm3.txt --classifier rrf -rm3') + pyserini_topics = 'core18' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ - {self.tmp}/core18_rrf_rm3.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + svm_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/core18_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/core18_lr_rm3.txt {self.tmp}/core18_svm_rm3.txt ' \ + + f'--out {self.tmp}/core18_rrf_rm3.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \ + {self.tmp}/core18_rrf_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.3204, delta=0.0001) + self.assertAlmostEqual(score, 0.3214, delta=0.0001) def test_robust04(self): self.assertTrue(self.robust04_checker.run('robust04_bm25', '--bm25', 0.2531)) @@ -399,136 +530,202 @@ def test_robust04_rm3(self): self.assertTrue(self.robust04_checker.run('robust04_bm25_rm3', '--bm25 --rm3', 0.2903)) def test_robust04_lr(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_lr.txt --classifier lr ') + pyserini_topics = 'robust04' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_lr.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_lr.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2721, delta=0.0001) + self.assertAlmostEqual(score, 0.2747, delta=0.0001) def test_robust04_lr_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_lr_rm3.txt --classifier lr -rm3') + pyserini_topics = 'robust04' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_lr_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_lr_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2961, delta=0.0001) + self.assertAlmostEqual(score, 0.2971, delta=0.0001) def test_robust04_svm(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_svm.txt --classifier svm ') + pyserini_topics = 'robust04' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_svm.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2716, delta=0.0001) + self.assertAlmostEqual(score, 0.2726, delta=0.0001) def test_robust04_svm_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_svm_rm3.txt --classifier svm -rm3') + pyserini_topics = 'robust04' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_svm_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2963, delta=0.0001) + self.assertAlmostEqual(score, 0.2967, delta=0.0001) def test_robust04_avg(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_lr+svm.txt --classifier lr+svm') + pyserini_topics = 'robust04' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_lr+svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_avg.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_avg.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2745, delta=0.0001) + self.assertAlmostEqual(score, 0.276, delta=0.0001) def test_robust04_avg_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_lr+svm_rm3.txt --classifier lr+svm -rm3') + pyserini_topics = 'robust04' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_lr+svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_avg_rm3.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_avg_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') self.assertAlmostEqual(score, 0.2980, delta=0.0001) def test_robust04_rrf(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_rrf.txt --classifier rrf') + pyserini_topics = 'robust04' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_rrf.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/robust04_lr.txt {self.tmp}/robust04_svm.txt ' \ + + f'--out {self.tmp}/robust04_rrf.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_rrf.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2740, delta=0.0001) + self.assertAlmostEqual(score, 0.275, delta=0.0001) def test_robust04_rrf_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust04 --output {self.tmp}/robust04_rrf_rm3.txt --classifier rrf -rm3') + pyserini_topics = 'robust04' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ - {self.tmp}/robust04_rrf_rm3.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust04_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/robust04_lr_rm3.txt {self.tmp}/robust04_svm_rm3.txt ' \ + + f'--out {self.tmp}/robust04_rrf_rm3.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \ + {self.tmp}/robust04_rrf_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') @@ -541,144 +738,210 @@ def test_robust05_rm3(self): self.assertTrue(self.robust05_checker.run('robust05_bm25_rm3', '--bm25 --rm3', 0.2602)) def test_robust05_lr(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_lr.txt --classifier lr ') + pyserini_topics = 'robust05' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_lr.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_lr.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') self.assertAlmostEqual(score, 0.2476, delta=0.0001) def test_robust05_lr_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_lr_rm3.txt --classifier lr -rm3') + pyserini_topics = 'robust05' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_lr_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_lr_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2845, delta=0.0001) + self.assertAlmostEqual(score, 0.2854, delta=0.0001) def test_robust05_svm(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_svm.txt --classifier svm ') + pyserini_topics = 'robust05' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_svm.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2454, delta=0.0001) + self.assertAlmostEqual(score, 0.2486, delta=0.0001) def test_robust05_svm_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_svm_rm3.txt --classifier svm -rm3') + pyserini_topics = 'robust05' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_svm_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2850, delta=0.0001) + self.assertAlmostEqual(score, 0.2855, delta=0.0001) def test_robust05_avg(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_lr+svm.txt --classifier lr+svm ') + pyserini_topics = 'robust05' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_lr+svm.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_avg.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_avg.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') self.assertAlmostEqual(score, 0.2485, delta=0.0001) def test_robust05_avg_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_lr+svm_rm3.txt --classifier lr+svm -rm3') + pyserini_topics = 'robust05' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_lr+svm_rm3.txt' + run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_avg_rm3.txt ' \ + + f'--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3' - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + status = os.system(run_file_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_avg_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2860, delta=0.0001) + self.assertAlmostEqual(score, 0.2865, delta=0.0001) def test_robust05_rrf(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_rrf.txt --classifier rrf') + pyserini_topics = 'robust05' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_lr.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_rrf.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_svm.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/robust05_lr.txt {self.tmp}/robust05_svm.txt ' \ + + f'--out {self.tmp}/robust05_rrf.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_rrf.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2490, delta=0.0001) + self.assertAlmostEqual(score, 0.2401, delta=0.0001) def test_robust05_rrf_rm3(self): - os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \ - --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \ - --collection robust05 --output {self.tmp}/robust05_rrf_rm3.txt --classifier rrf -rm3') + pyserini_topics = 'robust05' + lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_lr_rm3.txt ' \ + + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3' - cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ - {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ - {self.tmp}/robust05_rrf_rm3.txt' + status = os.system(lr_cmd) + self.assertEqual(status, 0) - status = os.system(cmd) - stdout, stderr = run_command(cmd) - score = parse_score(stdout, 'map') + svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} ' \ + + f'--topics {pyserini_topics} --output {self.tmp}/robust05_svm_rm3.txt ' \ + + f'--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3' + + status = os.system(svm_cmd) + self.assertEqual(status, 0) + + rrf_cmd = f'python {self.anserini_root}/src/main/python/fusion.py ' \ + + f'--runs {self.tmp}/robust05_lr_rm3.txt {self.tmp}/robust05_svm_rm3.txt ' \ + + f'--out {self.tmp}/robust05_rrf_rm3.txt' + + status = os.system(rrf_cmd) + self.assertEqual(status, 0) + + score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 \ + {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \ + {self.tmp}/robust05_rrf_rm3.txt' + + status = os.system(score_cmd) + stdout, stderr = run_command(score_cmd) + score = parse_score(stdout, "map") self.assertEqual(status, 0) self.assertEqual(stderr, '') - self.assertAlmostEqual(score, 0.2870, delta=0.0001) + self.assertAlmostEqual(score, 0.2788, delta=0.0001) def tearDown(self): shutil.rmtree(f'{self.tmp}') if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From 63b49b04e7778abc994c9df46acfee6075c6d02b Mon Sep 17 00:00:00 2001 From: Yuqi Date: Wed, 28 Apr 2021 11:32:59 -0400 Subject: [PATCH 20/20] fix a bug by adding mkdir command --- integrations/test_simplesearcher_check_prf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/test_simplesearcher_check_prf.py b/integrations/test_simplesearcher_check_prf.py index 1a4f32a4e..12e3c36b1 100644 --- a/integrations/test_simplesearcher_check_prf.py +++ b/integrations/test_simplesearcher_check_prf.py @@ -84,6 +84,7 @@ def setUp(self): def test_cross_validation(self): pyserini_topics = 'core17' + os.mkdir(f'{self.tmp}/core17') for alpha in [x / 10.0 for x in range(0, 11)]: run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \ + f'--topics {pyserini_topics} --output {self.tmp}/core17/core17_lr_A{alpha}_bm25.txt ' \