18 enhance cli to support multiple urls (#19)

noexec authored Mar 22, 2023
2 parents 0942a20 + fbf139a commit 63416dd

Showing 3 changed files with 102 additions and 10 deletions.
src/curldl/cli.py (13 additions, 10 deletions)

@@ -47,8 +47,8 @@ def _parse_arguments(cls) -> argparse.Namespace:
         parser.add_argument('-V', '--version', action='version', version=version, help='show program version and exit')

         parser.add_argument('-b', '--basedir', default='.', help='base download folder')
-        parser.add_argument('-o', '--output', help='basedir-relative path to the downloaded file, '
-                            'infer from URL if unspecified')
+        output_arg = parser.add_argument('-o', '--output', nargs=1, help='basedir-relative path to the '
+                                         'downloaded file, infer from URL if unspecified')
         parser.add_argument('-s', '--size', type=int, help='expected download file size')

         parser.add_argument('-a', '--algo', choices=hash_algos, default='sha256',
@@ -60,28 +60,31 @@ def _parse_arguments(cls) -> argparse.Namespace:
                             metavar='LEVEL', help='logging level: ' + ', '.join(log_choices))
         parser.add_argument('-v', '--verbose', action='store_true', help='log metadata and headers (implies -l debug)')

-        parser.add_argument('url', help='URL to download')
+        parser.add_argument('url', nargs='+', help='URL(s) to download')

         args = parser.parse_args()
         cls._configure_logger(args)

-        return cls._infer_arguments(args)
+        return cls._infer_arguments(output_arg, args)

     @classmethod
-    def _infer_arguments(cls, args: argparse.Namespace) -> argparse.Namespace:
+    def _infer_arguments(cls, output_arg: argparse.Action, args: argparse.Namespace) -> argparse.Namespace:
         """Infer missing arguments"""
         if not args.output:
-            url_path = urllib.parse.unquote(urllib.parse.urlparse(args.url).path)
-            args.output = os.path.basename(url_path)
-            log.info('Saving output to: %s', args.output)
+            args.output = [os.path.basename(urllib.parse.unquote(urllib.parse.urlparse(url).path)) for url in args.url]
+            log.info('Saving download(s) to: %s', ', '.join(args.output))
+
+        elif len(args.output) != len(args.url):
+            raise argparse.ArgumentError(output_arg, 'Cannot specify output file when downloading multiple URLs')

         return args

     def main(self) -> object:
         """Command-line program entry point"""
         downloader = Downloader(self.args.basedir, progress=self.args.progress, verbose=self.args.verbose)
-        downloader.download(self.args.url, rel_path=self.args.output, size=self.args.size,
-                            digests=self.args.digest and {self.args.algo: self.args.digest})
+        for url, output in zip(self.args.url, self.args.output):
+            downloader.download(url, rel_path=output, size=self.args.size,
+                                digests=self.args.digest and {self.args.algo: self.args.digest})
         return 0

     @staticmethod
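As an aside, here is a minimal sketch (not part of the commit) of the output-name inference that the new list comprehension in _infer_arguments performs. The helper name and example URL are hypothetical; only the urllib.parse and os.path calls from the diff above are used:

import os.path
import urllib.parse

def infer_output_names(urls):
    # Hypothetical helper mirroring the comprehension above: percent-decode
    # each URL's path component and keep only its final path segment.
    return [os.path.basename(urllib.parse.unquote(urllib.parse.urlparse(url).path))
            for url in urls]

# The query string and fragment live outside .path, so they are discarded:
print(infer_output_names(['https://example.com/dir/some%20file.txt?c=d#id']))
# -> ['some file.txt']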
tests/functional/test_cli.py (63 additions)

@@ -1,4 +1,5 @@
"""CLI entry point functional tests"""
import argparse
import inspect
import logging
import os.path
Expand All @@ -10,6 +11,7 @@
import pytest
import toml
from _pytest.capture import CaptureFixture
from _pytest.logging import LogCaptureFixture
from pytest_httpserver import HTTPServer
from pytest_mock import MockerFixture

@@ -107,3 +109,64 @@ def test_download_file(mocker: MockerFixture, tmp_path: pathlib.Path, httpserver
     assert file_path.is_file()
     with open(file_path, 'rb') as file:
         assert file.read() == file_data
+
+
+@pytest.mark.parametrize('specify_output_size', [False, True])
+@pytest.mark.parametrize('specify_output_digest', [False, True])
+def test_download_multiple_files(mocker: MockerFixture, caplog: LogCaptureFixture,
+                                 tmp_path: pathlib.Path, httpserver: HTTPServer,
+                                 specify_output_size: bool, specify_output_digest: bool) -> None:
+    """Verify that multiple file arguments are downloaded and that verification functions correctly
+    (including that digest verification is attempted on all files)"""
+    caplog.set_level(logging.DEBUG)
+
+    url_count = 5
+    file_datas = [bytes(chr(ord('x') + idx), 'ascii') * 128 for idx in range(url_count)]
+    file_names = [f'file{idx}.txt' for idx in range(url_count)]
+    file_digest = '150fa3fbdc899bd0b8f95a9fb6027f564d953762'
+    should_succeed = not specify_output_digest
+
+    arguments = ['-b', str(tmp_path), '-l', 'debug', '-v', '-p']
+    if specify_output_size:
+        arguments += ['-s', str(128)]
+    if specify_output_digest:
+        arguments += ['-a', 'sha1', '-d', file_digest]
+
+    for idx in range(url_count):
+        httpserver.expect_ordered_request(
+            url_path := f'/loc1/loc2/{file_names[idx]};a=b', query_string={'c': 'd'}).respond_with_data(file_datas[idx])
+        arguments += [httpserver.url_for(url_path + '?c=d#id')]
+
+    patch_logger_config(mocker, 'debug')
+    patch_system_environment(mocker, arguments)
+
+    if not should_succeed:
+        with pytest.raises(ValueError):
+            cli.main()
+    else:
+        status_code = cli.main()
+        assert status_code == 0
+
+    httpserver.check()
+    for idx in range(url_count):
+        file_path = tmp_path / file_names[idx]
+        assert file_path.exists() == should_succeed or idx == 0
+        if file_path.exists():
+            with open(file_path, 'rb') as file:
+                assert file.read() == file_datas[idx]
+
+
+def test_multiple_downloads_with_output_file(mocker: MockerFixture, tmp_path: pathlib.Path,
+                                             httpserver: HTTPServer) -> None:
+    """Verify that specifying output with multiple files results in an exception and no download is attempted"""
+    arguments = ['-b', str(tmp_path), '-o', 'file.txt', '-l', 'critical',
+                 httpserver.url_for('/file1.txt'), httpserver.url_for('/file2.txt')]
+
+    patch_logger_config(mocker, 'critical')
+    patch_system_environment(mocker, arguments)
+
+    with pytest.raises(argparse.ArgumentError):
+        cli.main()
+
+    httpserver.check()
+    assert not os.listdir(tmp_path)
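A hedged observation on test_download_multiple_files: the assertion file_path.exists() == should_succeed or idx == 0 suggests that the hard-coded sha1 value is the digest of the first payload (b'x' * 128), so when a single digest is supplied it matches only the first download and every later one fails verification. Assuming that reading of the test is right, it can be checked with standard hashlib:

import hashlib

# If the assumption holds, this prints the constant used in the test,
# 150fa3fbdc899bd0b8f95a9fb6027f564d953762.
print(hashlib.sha1(b'x' * 128).hexdigest())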
tests/unit/test_downloader.py (26 additions)

@@ -205,9 +205,11 @@ def download_and_possibly_verify() -> None:
     download_and_possibly_verify()
     httpserver.check()

+    partial_timestamp = os.stat(tmp_path / 'file.bin').st_mtime
     assert read_file_content(tmp_path / 'file.bin') == file_data
     os.rename(tmp_path / 'file.bin', tmp_path / 'file.bin.part')
     os.truncate(tmp_path / 'file.bin.part', part_size)
+    os.utime(tmp_path / 'file.bin.part', times=(partial_timestamp, partial_timestamp))

     # Request #2 on partial file: generally fails with 503, which causes pycurl.error due to resume failure
     # Rolls back to existing partial file if verification data is present

@@ -302,3 +304,27 @@ def test_aborted_download(tmp_path: pathlib.Path, caplog: LogCaptureFixture) ->
         downloader.download(url, 'some-file.txt')

     assert ex_info.value.args[0] == pycurl.E_GOT_NOTHING
+
+
+@pytest.mark.parametrize('size', [100, 5000])
+@pytest.mark.parametrize('specify_size', [False, True])
+@pytest.mark.parametrize('always_keep_part_size', [0, 50, 2499, 2501, 8000])
+def test_partial_download_keep(tmp_path: pathlib.Path, httpserver: HTTPServer, caplog: LogCaptureFixture,
+                               size: int, specify_size: bool, always_keep_part_size: int) -> None:
+    """Verify policy of keeping partial download in absence of verification data"""
+    caplog.set_level(logging.DEBUG)
+    with open(tmp_path / 'file.txt.part', 'wb') as part_file:
+        part_file.write(b'x' * (size // 2))
+
+    httpserver.expect_oneshot_request('/file.txt').respond_with_handler(
+        make_range_response_handler('/file.txt', b'y' * size))
+
+    downloader = curldl.Downloader(basedir=tmp_path, verbose=True, retry_attempts=0, min_part_bytes=0,
+                                   min_always_keep_part_bytes=always_keep_part_size)
+    downloader.download(httpserver.url_for('/file.txt'), 'file.txt', size=(size if specify_size else None))
+    httpserver.check()
+
+    assert not (tmp_path / 'file.txt.part').exists()
+    assert read_file_content(tmp_path / 'file.txt') == ((b'x' * (size // 2) + b'y' * (size // 2))
+                                                        if specify_size or always_keep_part_size <= size // 2
+                                                        else b'y' * size)
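For context on the two lines added to the resume test (partial_timestamp and os.utime): os.truncate updates a file's modification time, so the test captures the original mtime first and restores it afterwards, presumably to keep the fabricated .part file indistinguishable from a genuinely interrupted download. A self-contained sketch of that idiom, with hypothetical paths and sizes:

import os
import pathlib
import tempfile
import time

with tempfile.TemporaryDirectory() as tmp:
    path = pathlib.Path(tmp) / 'file.bin'
    path.write_bytes(b'x' * 100)

    mtime = os.stat(path).st_mtime            # capture mtime before mutation
    time.sleep(0.05)                          # ensure a visible mtime change
    os.truncate(path, 50)                     # truncation bumps the mtime
    os.utime(path, times=(mtime, mtime))      # restore atime and mtime

    # Allow for filesystem timestamp granularity when comparing floats
    assert abs(os.stat(path).st_mtime - mtime) < 1e-3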