Commit 5e66f80

Merge pull request #23 from webis-de/serp-tests
Add SERP parsing tests
janheinrichmerker authored Nov 1, 2023
2 parents 9e51fa5 + 8f79086 commit 5e66f80
Showing 815 changed files with 32,151 additions and 68 deletions.
archive_query_log/results/test/generate_tests.py (32 changes: 27 additions & 5 deletions)
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from datetime import datetime, timezone
 from gzip import GzipFile
 from io import TextIOWrapper, BytesIO
 from json import loads
@@ -19,8 +20,9 @@
 from archive_query_log.model import Service, ArchivedQueryUrl
 
 NUM_SERVICES = 11
-SERVICE_NAMES = None
+# SERVICE_NAMES = None
+SERVICE_NAMES = ["google", "yahoo", "bing", "duckduckgo", "ask", "ecosia"]
 # SERVICE_NAMES = ["google"]
 NUM_QUERIES_PER_SERVICE = 50
 
 DATA_PATH = Path(
@@ -38,6 +40,15 @@
 PATTERN_SPECIAL_CHARS = compile(r"[^0-9a-z]+")
 
 
+def warc_url(url: str, timestamp: float) -> str:
+    wayback_timestamp = datetime \
+        .fromtimestamp(timestamp, timezone.utc) \
+        .strftime("%Y%m%d%H%M%S")
+    wayback_raw_url = \
+        f"https://web.archive.org/web/{wayback_timestamp}id_/{url}"
+    return wayback_raw_url
+
+
 def main():
     if SERVICE_NAMES is None:
         services: Iterable[Service] = SERVICES.values()
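
Note: the new warc_url helper builds a Wayback Machine "raw" capture URL. The Unix timestamp is rendered in the 14-digit form the archive expects, and the id_ flag after it requests the original capture bytes rather than the rewritten replay page. A minimal standalone sketch of the same logic (the example URL and timestamp below are made up):

    from datetime import datetime, timezone

    def warc_url(url: str, timestamp: float) -> str:
        # 14-digit UTC timestamp, e.g. "20200913122640".
        wayback_timestamp = datetime \
            .fromtimestamp(timestamp, timezone.utc) \
            .strftime("%Y%m%d%H%M%S")
        # "id_" asks web.archive.org for the unmodified capture.
        return f"https://web.archive.org/web/{wayback_timestamp}id_/{url}"

    print(warc_url("https://www.example.com/search?q=test", 1600000000))
    # https://web.archive.org/web/20200913122640id_/https://www.example.com/search?q=test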
@@ -52,8 +63,8 @@ def main():
 
     query_urls = defaultdict(list)
     for path in tqdm(
-        list(SAMPLE_QUERIES_PATH.glob("part*.gz")),
-        desc="Load service queries"
+            list(SAMPLE_QUERIES_PATH.glob("part*.gz")),
+            desc="Load service queries"
     ):
         # noinspection PyTypeChecker
         with GzipFile(path, "rb") as gf, TextIOWrapper(gf) as f:
@@ -62,6 +73,11 @@ def main():
                     continue
                 if "\"serp_warc_relative_path\": \"" not in line:
                     continue
+                if not any(
+                        f"\"search_provider_name\": \"{service_name}" in line
+                        for service_name in service_names
+                ):
+                    continue
                 query_url = loads(line)
                 if query_url["search_provider_name"] not in service_names:
                     continue
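
Note: the added any(...) check is a cheap substring prefilter on the raw JSONL line, so json.loads only runs for lines that can plausibly belong to one of the selected services; the check on the parsed search_provider_name afterwards stays authoritative, since a substring can match spuriously. A minimal sketch of the idiom (the service names and records below are made up):

    from json import loads

    service_names = ["google", "bing"]
    lines = [
        '{"search_provider_name": "google", "serp_query_text_url": "test"}',
        '{"search_provider_name": "naver", "serp_query_text_url": "test"}',
    ]
    for line in lines:
        # Cheap prefilter: skip json.loads entirely for other services.
        if not any(
                f"\"search_provider_name\": \"{name}\"" in line
                for name in service_names
        ):
            continue
        record = loads(line)
        # Exact check on the parsed record.
        if record["search_provider_name"] not in service_names:
            continue
        print(record["serp_query_text_url"])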
@@ -107,7 +123,10 @@ def main():
             url_headers = {
                 "Archived-URL": schema.dumps(archived_query_url),
             }
-            wayback_raw_url = query_url["serp_wayback_raw_url"]
+            wayback_raw_url = warc_url(
+                query_url["serp_url"],
+                int(query_url["serp_timestamp"]),
+            )
             response = get(
                 wayback_raw_url,
             )
@@ -171,7 +190,10 @@ def main():
         }
         with test_path.open("at") as o:
             for query_url in query_urls[service_name]:
-                wayback_raw_url = query_url["serp_wayback_raw_url"]
+                wayback_raw_url = warc_url(
+                    query_url["serp_url"],
+                    int(query_url["serp_timestamp"]),
+                )
 
                 query = query_url["serp_query_text_url"]
                 query = slugify(query)