Merge pull request #614 from jjjake/last-mod-download

First pass at using Last-Modified header to set mtime
jjjake · Mar 28, 2024 · 8c6d9b1
2 parents 87bc289 + 6a8f6b5
commit 8c6d9b1
Show file tree

Hide file tree

Showing 3 changed files with 83 additions and 45 deletions.
diff --git a/internetarchive/files.py b/internetarchive/files.py
@@ -28,6 +28,7 @@
 import socket
 import sys
 from contextlib import nullcontext, suppress
+from email.utils import parsedate_to_datetime
 from urllib.parse import quote
 
 from requests.exceptions import (
@@ -218,33 +219,6 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
                 raise OSError(f'{destdir} is not a directory!')
             file_path = os.path.join(destdir, file_path)
 
-        if not return_responses and os.path.exists(file_path.encode('utf-8')):
-            if ignore_existing:
-                msg = f'skipping {file_path}, file already exists.'
-                log.info(msg)
-                if verbose:
-                    print(f' {msg}', file=sys.stderr)
-                return
-            elif checksum:
-                with open(file_path, 'rb') as fp:
-                    md5_sum = utils.get_md5(fp)
-
-                if md5_sum == self.md5:
-                    msg = f'skipping {file_path}, file already exists based on checksum.'
-                    log.info(msg)
-                    if verbose:
-                        print(f' {msg}', file=sys.stderr)
-                    return
-            elif not fileobj:
-                st = os.stat(file_path.encode('utf-8'))
-                if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
-                        or self.name.endswith('_files.xml') and st.st_size != 0:
-                    msg = f'skipping {file_path}, file already exists based on length and date.'
-                    log.info(msg)
-                    if verbose:
-                        print(f' {msg}', file=sys.stderr)
-                    return
-
         parent_dir = os.path.dirname(file_path)
         try:
             if parent_dir != '' and return_responses is not True:
@@ -255,8 +229,44 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
                                              timeout=timeout,
                                              auth=self.auth,
                                              params=params)
+
+            # Get timestamp from Last-Modified header
+            dt = parsedate_to_datetime(response.headers['Last-Modified'])
+            last_mod_mtime = dt.timestamp()
+
             response.raise_for_status()
-            if return_responses:
+
+            # Check if we should skip...
+            if not return_responses and os.path.exists(file_path.encode('utf-8')):
+                if ignore_existing:
+                    msg = f'skipping {file_path}, file already exists.'
+                    log.info(msg)
+                    if verbose:
+                        print(f' {msg}', file=sys.stderr)
+                    return
+                elif checksum:
+                    with open(file_path, 'rb') as fp:
+                        md5_sum = utils.get_md5(fp)
+
+                    if md5_sum == self.md5:
+                        msg = f'skipping {file_path}, file already exists based on checksum.'
+                        log.info(msg)
+                        if verbose:
+                            print(f' {msg}', file=sys.stderr)
+                        return
+                elif not fileobj:
+                    st = os.stat(file_path.encode('utf-8'))
+                    if st.st_mtime == last_mod_mtime:
+                        if self.name == f'{self.identifier}_files.xml' \
+                            or (st.st_size == self.size):
+                            msg = (f'skipping {file_path}, file already exists based on '
+                                    'length and date.')
+                            log.info(msg)
+                            if verbose:
+                                print(f' {msg}', file=sys.stderr)
+                            return
+
+            elif return_responses:
                 return response
 
             if verbose:
@@ -298,11 +308,11 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
             else:
                 raise exc
 
-        # Set mtime with mtime from files.xml.
+        # Set mtime with timestamp from Last-Modified header
         if not no_change_timestamp:
             # If we want to set the timestamp to that of the original archive...
             with suppress(OSError):  # Probably file-like object, e.g. sys.stdout.
-                os.utime(file_path.encode('utf-8'), (0, self.mtime))
+                os.utime(file_path.encode('utf-8'), (0,last_mod_mtime))
 
         msg = f'downloaded {self.identifier}/{self.name} to {file_path}'
         log.info(msg)

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -266,10 +266,12 @@ def test_upload_validate_identifier():
 
 def test_download(tmpdir):
     tmpdir.chdir()
+    last_mod_header = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"}
     with IaRequestsMock() as rsps:
         rsps.add(responses.GET,
                  f'{PROTOCOL}//archive.org/download/nasa/nasa_meta.xml',
-                 body='test content')
+                 body='test content',
+                 adding_headers=last_mod_header)
         rsps.add_metadata_mock('nasa')
         download('nasa', 'nasa_meta.xml')
         p = os.path.join(str(tmpdir), 'nasa')

diff --git a/tests/test_item.py b/tests/test_item.py
@@ -24,6 +24,7 @@
 DOWNLOAD_URL_RE = re.compile(f'{PROTOCOL}//archive.org/download/.*')
 S3_URL_RE = re.compile(r'.*s3.us.archive.org/.*')
 
+EXPECTED_LAST_MOD_HEADER = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"}
 EXPECTED_S3_HEADERS = {
     'content-length': '7557',
     'x-archive-queue-derive': '1',
@@ -145,11 +146,15 @@ def test_get_files_no_matches(nasa_item):
 def test_download(tmpdir, nasa_item):
     tmpdir.chdir()
     with IaRequestsMock() as rsps:
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml')
         assert len(tmpdir.listdir()) == 1
     with IaRequestsMock() as rsps:
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='new test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml')
         with open('nasa/nasa_meta.xml') as fh:
             assert fh.read() == 'new test content'
@@ -158,7 +163,9 @@ def test_download(tmpdir, nasa_item):
 def test_download_io_error(tmpdir, nasa_item):
     tmpdir.chdir()
     with IaRequestsMock() as rsps:
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml')
         rsps.reset()
         with pytest.raises(ConnectionError):
@@ -167,7 +174,9 @@ def test_download_io_error(tmpdir, nasa_item):
 
 def test_download_ignore_errors(tmpdir, nasa_item):
     with IaRequestsMock() as rsps:
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml')
         nasa_item.download(files='nasa_meta.xml', ignore_errors=True)
 
@@ -177,11 +186,13 @@ def test_download_ignore_existing(tmpdir, nasa_item):
     with IaRequestsMock(
             assert_all_requests_are_fired=False) as rsps:
         rsps.add(responses.GET, DOWNLOAD_URL_RE,
-                 body='test content')
+                 body='test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml', ignore_existing=True)
 
         rsps.add(responses.GET, DOWNLOAD_URL_RE,
-                 body='new test content')
+                 body='new test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml', ignore_existing=True)
         with open('nasa/nasa_meta.xml') as fh:
             assert fh.read() == 'test content'
@@ -190,11 +201,15 @@ def test_download_ignore_existing(tmpdir, nasa_item):
 def test_download_clobber(tmpdir, nasa_item):
     tmpdir.chdir()
     with IaRequestsMock() as rsps:
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml')
 
         rsps.reset()
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='new test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml')
         assert load_file('nasa/nasa_meta.xml') == 'new test content'
 
@@ -205,8 +220,12 @@ def test_download_checksum(tmpdir, caplog):
     # test overwrite based on checksum.
     with IaRequestsMock() as rsps:
         rsps.add_metadata_mock('nasa')
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='overwrite based on md5')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='test content',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='overwrite based on md5',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
 
         nasa_item = get_item('nasa')
         nasa_item.download(files='nasa_meta.xml')
@@ -218,7 +237,8 @@ def test_download_checksum(tmpdir, caplog):
         with caplog.at_level(logging.DEBUG):
             rsps.reset()
             rsps.add(responses.GET, DOWNLOAD_URL_RE,
-                     body=load_test_data_file('nasa_meta.xml'))
+                     body=load_test_data_file('nasa_meta.xml'),
+                     adding_headers=EXPECTED_LAST_MOD_HEADER)
             nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True)
             nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True)
 
@@ -229,7 +249,9 @@ def test_download_checksum(tmpdir, caplog):
 def test_download_destdir(tmpdir, nasa_item):
     tmpdir.chdir()
     with IaRequestsMock() as rsps:
-        rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new destdir')
+        rsps.add(responses.GET, DOWNLOAD_URL_RE,
+                 body='new destdir',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         dest = os.path.join(str(tmpdir), 'new destdir')
         nasa_item.download(files='nasa_meta.xml', destdir=dest)
         assert 'nasa' in os.listdir(dest)
@@ -241,7 +263,9 @@ def test_download_no_directory(tmpdir, nasa_item):
     url_re = re.compile(f'{PROTOCOL}//archive.org/download/.*')
     tmpdir.chdir()
     with IaRequestsMock() as rsps:
-        rsps.add(responses.GET, url_re, body='no dest dir')
+        rsps.add(responses.GET, url_re,
+                 body='no dest dir',
+                 adding_headers=EXPECTED_LAST_MOD_HEADER)
         nasa_item.download(files='nasa_meta.xml', no_directory=True)
         with open(os.path.join(str(tmpdir), 'nasa_meta.xml')) as fh:
             assert fh.read() == 'no dest dir'
@@ -278,9 +302,11 @@ def test_download_dry_run_on_the_fly_formats(tmpdir, capsys, nasa_item):
 def test_download_verbose(tmpdir, capsys, nasa_item):
     tmpdir.chdir()
     with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
+        headers = {'content-length': '11'}
+        headers.update(EXPECTED_LAST_MOD_HEADER)
         rsps.add(responses.GET, DOWNLOAD_URL_RE,
                  body='no dest dir',
-                 adding_headers={'content-length': '11'})
+                 adding_headers=headers)
         nasa_item.download(files='nasa_meta.xml', verbose=True)
         out, err = capsys.readouterr()
         assert 'downloading nasa_meta.xml' in err