Improve cache filenames

The new scheme prevents multiple URLs ending in index.html from clashing in the cache and ensures the cache filename always has an extension.
olin · Apr 15, 2022 · 0e98fe6 · 0e98fe6
1 parent 4f0eb46
commit 0e98fe6
Showing 1 changed file with 43 additions and 1 deletion.
diff --git a/focstest.py b/focstest.py
@@ -192,6 +192,48 @@ def infer_url(filepath):
     return url
 
 
+def get_cache_filename(url: str) -> str:
+    """Get a filesystem-safe filename based on a url's host and path
+
+    The scheme and query string are ignored; '.html' is appended when the last path segment has no extension.
+
+    >>> get_cache_filename('http://foo.bar/baz/qux/')
+    'foo_bar_baz_qux.html'
+    >>> get_cache_filename('http://foo.bar/v1.2/qux')  # a dot in a directory name is not an extension
+    'foo_bar_v1.2_qux.html'
+
+    normalizes protocol
+    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('https://foo.bar/baz/qux')
+    True
+
+    normalizes trailing slashes
+    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/')
+    True
+
+    normalizes directory names and index.html
+    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/index.html')
+    True
+    >>> get_cache_filename('http://foo.bar/baz/qux/') == get_cache_filename('http://foo.bar/baz/qux/index.html')
+    True
+
+    two different homeworks end up with different filenames
+    >>> hw1 = infer_url('homework1.ml'); hw2 = infer_url('homework2.ml')
+    >>> get_cache_filename(hw1) != get_cache_filename(hw2)
+    True
+    """
+    BAD_CHARS = {'\0', '\\', '/', ':', '*', '?', '"', '>', '<', '|'}
+    parse_result = urllib.parse.urlparse(url)
+    # if Riccardo switches to php and the urls are query-encoded like 'homework.php?id=9', this will need to be updated
+    filename = parse_result.netloc.replace('.', '_') + parse_result.path
+    # normalize trailing / and /index.html
+    filename = filename.rstrip('/')
+    if filename.endswith('/index.html'):
+        filename = filename[:-len('/index.html')]
+    # default to .html extension; only a dot in the last path segment counts as an existing extension
+    suffix = '' if '.' in filename.rpartition('/')[2] else '.html'
+    return ''.join(c if c not in BAD_CHARS else '_' for c in filename) + suffix
+
+
 def main():
     parser = argparse.ArgumentParser(
         description='Run ocaml "doctests".',
@@ -244,7 +286,7 @@ def main():
     if not os.path.exists(CACHE_DIR):
         os.makedirs(CACHE_DIR)
         logger.info('Created cache directory at {!r}'.format(CACHE_DIR))
-    page_name = os.path.basename(urllib.parse.urlparse(URL).path)  # get page name from url
+    page_name = get_cache_filename(URL)
     html_filepath = os.path.join(CACHE_DIR, page_name)  # local filepath
 
     # get webpage if cached version doesn't already exist