Skip to content

Commit

Permalink
Merge pull request #614 from jjjake/last-mod-download
Browse files (browse the repository at this point in the history)
First pass at using Last-Modified header to set mtime
  • Loading branch information
jjjake authored Mar 28, 2024
2 parents 87bc289 + 6a8f6b5 commit 8c6d9b1
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 45 deletions.
70 changes: 40 additions & 30 deletions internetarchive/files.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,6 +28,7 @@
import socket
import sys
from contextlib import nullcontext, suppress
from email.utils import parsedate_to_datetime
from urllib.parse import quote

from requests.exceptions import (
Expand Down Expand Up @@ -218,33 +219,6 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
raise OSError(f'{destdir} is not a directory!')
file_path = os.path.join(destdir, file_path)

if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
or self.name.endswith('_files.xml') and st.st_size != 0:
msg = f'skipping {file_path}, file already exists based on length and date.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

parent_dir = os.path.dirname(file_path)
try:
if parent_dir != '' and return_responses is not True:
Expand All @@ -255,8 +229,44 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
timeout=timeout,
auth=self.auth,
params=params)

# Get timestamp from Last-Modified header
dt = parsedate_to_datetime(response.headers['Last-Modified'])
last_mod_mtime = dt.timestamp()

response.raise_for_status()
if return_responses:

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if st.st_mtime == last_mod_mtime:
if self.name == f'{self.identifier}_files.xml' \
or (st.st_size == self.size):
msg = (f'skipping {file_path}, file already exists based on '
'length and date.')
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

elif return_responses:
return response

if verbose:
Expand Down Expand Up @@ -298,11 +308,11 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
else:
raise exc

# Set mtime with mtime from files.xml.
# Set mtime with timestamp from Last-Modified header
if not no_change_timestamp:
# If we want to set the timestamp to that of the original archive...
with suppress(OSError): # Probably file-like object, e.g. sys.stdout.
os.utime(file_path.encode('utf-8'), (0, self.mtime))
os.utime(file_path.encode('utf-8'), (0,last_mod_mtime))

msg = f'downloaded {self.identifier}/{self.name} to {file_path}'
log.info(msg)
Expand Down
4 changes: 3 additions & 1 deletion tests/test_api.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -266,10 +266,12 @@ def test_upload_validate_identifier():

def test_download(tmpdir):
tmpdir.chdir()
last_mod_header = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"}
with IaRequestsMock() as rsps:
rsps.add(responses.GET,
f'{PROTOCOL}//archive.org/download/nasa/nasa_meta.xml',
body='test content')
body='test content',
adding_headers=last_mod_header)
rsps.add_metadata_mock('nasa')
download('nasa', 'nasa_meta.xml')
p = os.path.join(str(tmpdir), 'nasa')
Expand Down
54 changes: 40 additions & 14 deletions tests/test_item.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,6 +24,7 @@
DOWNLOAD_URL_RE = re.compile(f'{PROTOCOL}//archive.org/download/.*')
S3_URL_RE = re.compile(r'.*s3.us.archive.org/.*')

EXPECTED_LAST_MOD_HEADER = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"}
EXPECTED_S3_HEADERS = {
'content-length': '7557',
'x-archive-queue-derive': '1',
Expand Down Expand Up @@ -145,11 +146,15 @@ def test_get_files_no_matches(nasa_item):
def test_download(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
assert len(tmpdir.listdir()) == 1
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
with open('nasa/nasa_meta.xml') as fh:
assert fh.read() == 'new test content'
Expand All @@ -158,7 +163,9 @@ def test_download(tmpdir, nasa_item):
def test_download_io_error(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
rsps.reset()
with pytest.raises(ConnectionError):
Expand All @@ -167,7 +174,9 @@ def test_download_io_error(tmpdir, nasa_item):

def test_download_ignore_errors(tmpdir, nasa_item):
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
nasa_item.download(files='nasa_meta.xml', ignore_errors=True)

Expand All @@ -177,11 +186,13 @@ def test_download_ignore_existing(tmpdir, nasa_item):
with IaRequestsMock(
assert_all_requests_are_fired=False) as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content')
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', ignore_existing=True)

rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new test content')
body='new test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', ignore_existing=True)
with open('nasa/nasa_meta.xml') as fh:
assert fh.read() == 'test content'
Expand All @@ -190,11 +201,15 @@ def test_download_ignore_existing(tmpdir, nasa_item):
def test_download_clobber(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')

rsps.reset()
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
assert load_file('nasa/nasa_meta.xml') == 'new test content'

Expand All @@ -205,8 +220,12 @@ def test_download_checksum(tmpdir, caplog):
# test overwrite based on checksum.
with IaRequestsMock() as rsps:
rsps.add_metadata_mock('nasa')
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='overwrite based on md5')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='overwrite based on md5',
adding_headers=EXPECTED_LAST_MOD_HEADER)

nasa_item = get_item('nasa')
nasa_item.download(files='nasa_meta.xml')
Expand All @@ -218,7 +237,8 @@ def test_download_checksum(tmpdir, caplog):
with caplog.at_level(logging.DEBUG):
rsps.reset()
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body=load_test_data_file('nasa_meta.xml'))
body=load_test_data_file('nasa_meta.xml'),
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True)
nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True)

Expand All @@ -229,7 +249,9 @@ def test_download_checksum(tmpdir, caplog):
def test_download_destdir(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new destdir')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new destdir',
adding_headers=EXPECTED_LAST_MOD_HEADER)
dest = os.path.join(str(tmpdir), 'new destdir')
nasa_item.download(files='nasa_meta.xml', destdir=dest)
assert 'nasa' in os.listdir(dest)
Expand All @@ -241,7 +263,9 @@ def test_download_no_directory(tmpdir, nasa_item):
url_re = re.compile(f'{PROTOCOL}//archive.org/download/.*')
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, url_re, body='no dest dir')
rsps.add(responses.GET, url_re,
body='no dest dir',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', no_directory=True)
with open(os.path.join(str(tmpdir), 'nasa_meta.xml')) as fh:
assert fh.read() == 'no dest dir'
Expand Down Expand Up @@ -278,9 +302,11 @@ def test_download_dry_run_on_the_fly_formats(tmpdir, capsys, nasa_item):
def test_download_verbose(tmpdir, capsys, nasa_item):
tmpdir.chdir()
with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
headers = {'content-length': '11'}
headers.update(EXPECTED_LAST_MOD_HEADER)
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='no dest dir',
adding_headers={'content-length': '11'})
adding_headers=headers)
nasa_item.download(files='nasa_meta.xml', verbose=True)
out, err = capsys.readouterr()
assert 'downloading nasa_meta.xml' in err
Expand Down

0 comments on commit 8c6d9b1

Please sign in to comment.