From 939e775c2792aa5fa2eff3d49f9417755f1b880d Mon Sep 17 00:00:00 2001
From: nanos
Date: Thu, 15 Aug 2024 08:00:00 +0100
Subject: [PATCH] Try to use xxHash to hash robots cache file names

This should be faster and more efficient: the hash only has to produce a
stable, well-distributed cache file name, not a cryptographic digest, so a
non-cryptographic hash like xxHash is sufficient.
---
 find_posts.py    | 4 ++--
 requirements.txt | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index 408f0f5b..9851820d 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -16,7 +16,7 @@
 import defusedxml.ElementTree as ET
 import urllib.robotparser
 from urllib.parse import urlparse
-import hashlib
+import xxhash
 
 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
@@ -1076,7 +1076,7 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
     return result
 
 def get_robots_txt_cache_path(robots_url):
-    hash = hashlib.sha256(robots_url.encode('utf-8'))
+    hash = xxhash.xxh128(robots_url.encode('utf-8'))
     return os.path.join(arguments.state_dir, f'robots-{hash.hexdigest()}.txt')
 
 def get_cached_robots(robots_url):
diff --git a/requirements.txt b/requirements.txt
index dbfd30b5..a3fb88d6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ requests==2.32.0
 six==1.16.0
 smmap==5.0.0
 urllib3==1.26.19
+xxhash==3.4.1
\ No newline at end of file
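
Note (not part of the patch): a minimal standalone sketch of the new cache-path
scheme, assuming the xxhash 3.x package is installed; `state_dir` here stands in
for FediFetcher's `arguments.state_dir`.

    import os
    import xxhash

    def robots_cache_path(robots_url, state_dir='.'):
        # xxh128 is a fast non-cryptographic hash; it is well-distributed
        # enough for deriving cache file names, and no security property
        # is needed here.
        digest = xxhash.xxh128(robots_url.encode('utf-8')).hexdigest()
        return os.path.join(state_dir, f'robots-{digest}.txt')

    # Produces robots-<32 hex chars>.txt rather than the 64-character
    # SHA-256 name used previously.
    print(robots_cache_path('https://example.social/robots.txt'))

One side effect worth noting: existing robots-<sha256>.txt cache files will no
longer match the new names, so cached robots.txt entries will presumably be
re-fetched once after this change.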