Skip to content

Commit

Permalink
Improve cache filenames
Browse files Browse the repository at this point in the history
The new scheme prevents multiple urls ending in index.html from
clashing, and ensures an extension is always present.
  • Loading branch information
newsch committed Apr 15, 2022
1 parent 4f0eb46 commit 0e98fe6
Showing 1 changed file with 43 additions and 1 deletion.
44 changes: 43 additions & 1 deletion focstest.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,48 @@ def infer_url(filepath):
return url


def get_cache_filename(url: str) -> str:
    """Get a filesystem-safe filename based on a url.

    The name is built from the host (dots replaced by underscores) followed
    by the url path, with characters that are unsafe in filenames replaced
    by underscores. The scheme, query string and fragment are ignored. If
    the final path segment has no extension, '.html' is appended.

    :param url: the url of the page to be cached
    :returns: a flat filename (no directory separators) for the cache

    >>> get_cache_filename('http://foo.bar/baz/qux/')
    'foo_bar_baz_qux.html'

    normalizes protocol

    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('https://foo.bar/baz/qux')
    True

    normalizes trailing slashes

    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/')
    True

    normalizes directory names and index.html

    >>> get_cache_filename('http://foo.bar/baz/qux') == get_cache_filename('http://foo.bar/baz/qux/index.html')
    True
    >>> get_cache_filename('http://foo.bar/baz/qux/') == get_cache_filename('http://foo.bar/baz/qux/index.html')
    True

    a dot in a directory name is not mistaken for a file extension

    >>> get_cache_filename('http://foo.bar/v1.2/page')
    'foo_bar_v1.2_page.html'

    an existing extension is kept

    >>> get_cache_filename('http://foo.bar/hw/homework1.pdf')
    'foo_bar_hw_homework1.pdf'

    two different homeworks end up with different filenames

    >>> hw1 = infer_url('homework1.ml'); hw2 = infer_url('homework2.ml')
    >>> get_cache_filename(hw1) != get_cache_filename(hw2)
    True
    """
    BAD_CHARS = {'\0', '\\', '/', ':', '*', '?', '"', '>', '<', '|'}

    parse_result = urllib.parse.urlparse(url)
    # The query string is ignored: if the course site ever switches to
    # query-encoded urls like 'homework.php?id=9', this will need to be updated.
    # Dots in the host are replaced so they are never read as an extension.
    filename = parse_result.netloc.replace('.', '_') + parse_result.path
    # normalize trailing / and /index.html
    filename = filename.rstrip('/')
    if filename.endswith('/index.html'):
        filename = filename[:-len('/index.html')]

    # Remember the final path segment before the separators are flattened:
    # only that segment should decide whether an extension is present.
    last_segment = filename.rpartition('/')[2]
    filename = ''.join('_' if c in BAD_CHARS else c for c in filename)
    # Sanitizing is one-for-one per character, so the sanitized final segment
    # is the tail of the sanitized name with the same length.
    sanitized_last = filename[len(filename) - len(last_segment):]

    # default to .html extension
    _, ext = os.path.splitext(sanitized_last)
    if ext == '':
        filename += '.html'
    return filename


def main():
parser = argparse.ArgumentParser(
description='Run ocaml "doctests".',
Expand Down Expand Up @@ -244,7 +286,7 @@ def main():
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
logger.info('Created cache directory at {!r}'.format(CACHE_DIR))
page_name = os.path.basename(urllib.parse.urlparse(URL).path) # get page name from url
page_name = get_cache_filename(URL)
html_filepath = os.path.join(CACHE_DIR, page_name) # local filepath

# get webpage if cached version doesn't already exist
Expand Down

0 comments on commit 0e98fe6

Please sign in to comment.