From 939e775c2792aa5fa2eff3d49f9417755f1b880d Mon Sep 17 00:00:00 2001
From: nanos
Date: Thu, 15 Aug 2024 08:00:00 +0100
Subject: [PATCH] Try to use xxHash to hash robots cache file names

This should be faster and more efficient: the hash only has to produce a
stable, well-distributed cache file name, not a cryptographic digest, so a
non-cryptographic hash like xxHash is sufficient.
---
 find_posts.py    | 4 ++--
 requirements.txt | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index 408f0f5b..9851820d 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -16,7 +16,7 @@
 import defusedxml.ElementTree as ET
 import urllib.robotparser
 from urllib.parse import urlparse
-import hashlib
+import xxhash
 
 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
@@ -1076,7 +1076,7 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
     return result
 
 def get_robots_txt_cache_path(robots_url):
-    hash = hashlib.sha256(robots_url.encode('utf-8'))
+    hash = xxhash.xxh128(robots_url.encode('utf-8'))
     return os.path.join(arguments.state_dir, f'robots-{hash.hexdigest()}.txt')
 
 def get_cached_robots(robots_url):
diff --git a/requirements.txt b/requirements.txt
index dbfd30b5..a3fb88d6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ requests==2.32.0
 six==1.16.0
 smmap==5.0.0
 urllib3==1.26.19
+xxhash==3.4.1
\ No newline at end of file
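
Note (not part of the patch): a minimal standalone sketch of the new cache-path
scheme, assuming the xxhash 3.x package is installed; `state_dir` here stands in
for FediFetcher's `arguments.state_dir`.

    import os
    import xxhash

    def robots_cache_path(robots_url, state_dir='.'):
        # xxh128 is a fast non-cryptographic hash; it is well-distributed
        # enough for deriving cache file names, and no security property
        # is needed here.
        digest = xxhash.xxh128(robots_url.encode('utf-8')).hexdigest()
        return os.path.join(state_dir, f'robots-{digest}.txt')

    # Produces robots-<32 hex chars>.txt rather than the 64-character
    # SHA-256 name used previously.
    print(robots_cache_path('https://example.social/robots.txt'))

One side effect worth noting: existing robots-<sha256>.txt cache files will no
longer match the new names, so cached robots.txt entries will presumably be
re-fetched once after this change.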