use sha hashes to cache file names #138

Merged 1 commit on Jul 2, 2024
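For orientation before the diff: the change replaces host-name-based robots.txt cache file names with SHA-256 hashes of the robots.txt URL. A minimal sketch of the naming scheme, assuming a state directory of 'artifacts' (the directory and helper name here are illustrative, not taken from the PR):

import hashlib
import os

def robots_cache_filename(state_dir, robots_url):
    # SHA-256 of the full robots.txt URL yields a fixed-length,
    # filesystem-safe name, independent of characters in the host name.
    digest = hashlib.sha256(robots_url.encode('utf-8')).hexdigest()
    return os.path.join(state_dir, f'robots-{digest}.txt')

# Old scheme (per the removed code below): robots-<netloc>
# New scheme: robots-<64-hex-digit sha256>.txt
print(robots_cache_filename('artifacts', 'https://example.social/robots.txt'))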
76 changes: 45 additions & 31 deletions find_posts.py
@@ -16,6 +16,7 @@
 import defusedxml.ElementTree as ET
 import urllib.robotparser
 from urllib.parse import urlparse
+import hashlib

 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
@@ -1011,41 +1012,54 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
             break
     return result

-def can_fetch(user_agent, url):
-    parsed_uri = urlparse(url)
-    robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
-
-    if robots in ROBOTS_TXT:
-        if isinstance(ROBOTS_TXT[robots], bool):
-            return ROBOTS_TXT[robots]
-        else:
-            robotsTxt = ROBOTS_TXT[robots]
-    else:
-        robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
-        if os.path.exists(robotsCachePath):
-            with open(robotsCachePath, "r", encoding="utf-8") as f:
-                logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}")
-                robotsTxt = f.read()
-                ROBOTS_TXT[robots] = robotsTxt
-        else:
-            try:
-                # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
-                robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
-                if robotsTxt.status_code in (401, 403):
-                    ROBOTS_TXT[robots] = False
-                    return False
-                elif robotsTxt.status_code != 200:
-                    ROBOTS_TXT[robots] = True
-                    return True
-                robotsTxt = robotsTxt.text
-                ROBOTS_TXT[robots] = robotsTxt
-
-                with open(robotsCachePath, "w", encoding="utf-8") as f:
-                    f.write(robotsTxt)
-            except Exception as ex:
-                return True
+def get_robots_txt_cache_path(robots_url):
+    hash = hashlib.sha256(robots_url.encode('utf-8'))
+    return os.path.join(arguments.state_dir, f'robots-{hash.hexdigest()}.txt')
+
+def get_cached_robots(robots_url):
+    ## firstly: check the in-memory cache
+    if robots_url in ROBOTS_TXT:
+        return ROBOTS_TXT[robots_url]
+
+    robotsCachePath = get_robots_txt_cache_path(robots_url)
+    if os.path.exists(robotsCachePath):
+        with open(robotsCachePath, "r", encoding="utf-8") as f:
+            logger.debug(f"Getting robots.txt file from cache for {robots_url}.")
+            robotsTxt = f.read()
+            ROBOTS_TXT[robots_url] = robotsTxt
+            return robotsTxt
+
+    return None
+
+def get_robots_from_url(robots_url):
+    robotsTxt = get_cached_robots(robots_url)
+    if robotsTxt != None:
+        return robotsTxt
+
+    try:
+        # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
+        robotsTxt = get(robots_url, timeout = 2, ignore_robots_txt=True)
+        if robotsTxt.status_code in (401, 403):
+            robotsTxt = False
+        else:
+            robotsTxt = robotsTxt.text
+            with open(get_robots_txt_cache_path(robots_url), "w", encoding="utf-8") as f:
+                f.write(robotsTxt)
+    except Exception as ex:
+        robotsTxt = True
+
+    ROBOTS_TXT[robots_url] = robotsTxt
+    return robotsTxt
+
+
+def can_fetch(user_agent, url):
+    parsed_uri = urlparse(url)
+    robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
+
+    robotsTxt = get_robots_from_url(robots_url)
+    if isinstance(robotsTxt, bool):
+        return robotsTxt

     robotParser = urllib.robotparser.RobotFileParser()
     robotParser.parse(robotsTxt.splitlines())
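Taken together, the refactored helpers give robots.txt retrieval a three-level lookup: the in-memory ROBOTS_TXT dict, then the hashed on-disk cache file, then a live fetch that populates both caches. A self-contained sketch of that order, assuming a plain dict and urllib in place of FediFetcher's own get() wrapper (all names here are illustrative, not part of the PR):

import hashlib
import os
import urllib.request

memory_cache = {}

def cache_path(state_dir, robots_url):
    digest = hashlib.sha256(robots_url.encode('utf-8')).hexdigest()
    return os.path.join(state_dir, f'robots-{digest}.txt')

def fetch_robots(state_dir, robots_url):
    # 1. in-memory cache
    if robots_url in memory_cache:
        return memory_cache[robots_url]
    # 2. on-disk cache, keyed by the hashed URL
    path = cache_path(state_dir, robots_url)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        # 3. network fetch, then write through to the on-disk cache
        with urllib.request.urlopen(robots_url, timeout=2) as resp:
            text = resp.read().decode('utf-8', errors='replace')
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)
    memory_cache[robots_url] = text
    return text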