From 8be3efb6e4adff06e77945d2a1af6f23fed5ad63 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 8 Jun 2021 13:08:39 +1000 Subject: [PATCH 01/38] Fix bug with Imgur gifs being shortened too much The rstrip function was used wrongly, it doesn't remove a substring but rather removes any of the characters provided, so here it removed any I, G, V, or F that finished the six character ID for Imgur, resulting in a 404 error for the resources in question. --- bdfr/site_downloaders/imgur.py | 2 +- tests/site_downloaders/test_imgur.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 3d071d45..bd974be6 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -39,7 +39,7 @@ def _compute_image_url(self, image: dict) -> Resource: def _get_data(link: str) -> dict: if re.match(r'.*\.gifv$', link): link = link.replace('i.imgur', 'imgur') - link = link.rstrip('.gifv') + link = re.sub('\\.gifv$', '', link) res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'}) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 792926a1..0e557edb 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -130,6 +130,12 @@ def test_imgur_extension_validation_bad(test_extension: str): 'fb6c913d721c0bbb96aa65d7f560d385', ), ), + ( + 'https://i.imgur.com/lFJai6i.gifv', + ( + '01a6e79a30bec0e644e5da12365d5071', + ), + ) )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() From 8ba2d0bb555d059acd216222cb16c5d76d0d3942 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 10 Jun 2021 18:59:22 +1000 Subject: [PATCH 02/38] Add missing return statement --- bdfr/downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 61158a32..a0d8834d 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ 
-104,6 +104,7 @@ def _download_submission(self, submission: praw.models.Submission): except OSError as e: logger.exception(e) logger.error(f'Failed to write file to {destination} in submission {submission.id}: {e}') + return creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple()) os.utime(destination, (creation_time, creation_time)) self.master_hash_list[resource_hash] = destination From 6eeadc88214bf3b5aff8c893ac6a338b38d26187 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 11 Jun 2021 15:31:11 +1000 Subject: [PATCH 03/38] Add option for archiver full context --- bdfr/__main__.py | 1 + bdfr/archiver.py | 3 +++ bdfr/configuration.py | 3 ++- tests/test_integration.py | 14 ++++++++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 11035819..6312c769 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -50,6 +50,7 @@ _archiver_options = [ click.option('--all-comments', is_flag=True, default=None), + click.option('--full-context', is_flag=True, default=None), click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None), ] diff --git a/bdfr/archiver.py b/bdfr/archiver.py index b19a0420..f2870cc9 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -61,6 +61,9 @@ def _pull_lever_entry_factory(praw_item: (praw.models.Submission, praw.models.Co raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}') def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)): + if self.args.full_context and isinstance(praw_item, praw.models.Comment): + logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}') + praw_item = praw_item.submission archive_entry = self._pull_lever_entry_factory(praw_item) if self.args.format == 'json': self._write_entry_json(archive_entry) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 327a4536..558b79f4 100644 --- 
a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -41,8 +41,9 @@ def __init__(self): self.verbose: int = 0 # Archiver-specific options - self.format = 'json' self.all_comments = False + self.format = 'json' + self.full_context: bool = False def process_click_arguments(self, context: click.Context): for arg_key in context.params.keys(): diff --git a/tests/test_integration.py b/tests/test_integration.py index 6a9e52bf..0a6de3dd 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -252,6 +252,20 @@ def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--full-context', '--link', 'gxqapql'], +)) +def test_cli_archive_full_context(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Converting comment' in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow From e500bc4ad4be19dd5d63ca7c7c1f4d7ccc51f5b3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 11 Jun 2021 15:35:12 +1000 Subject: [PATCH 04/38] Update README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index a06e4af5..bf6d4f9b 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,9 @@ The following options are for the `archive` command specifically. 
- `json` (default) - `xml` - `yaml` +- `--full-context` + - This option will, instead of downloading an individual comment, download the submission that comment is a part of + - May result in a longer run time as it retrieves much more data ### Cloner Options From 9fd8b29833fa5f87ebdb6c4786f6489e79f1c297 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 11 Jun 2021 18:36:40 +1000 Subject: [PATCH 05/38] Add another logging message to script --- scripts/extract_failed_ids.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 89f1896c..104c7af5 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -19,4 +19,5 @@ fi grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; grep 'Failed to write file' "$file" | awk '{ print $16 }' | rev | cut -c 2- | rev ; + grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ; } >>"$output" From b4ae513e7105b2f17d85227bbe65c17f91677b35 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 11 Jun 2021 18:40:26 +1000 Subject: [PATCH 06/38] Add submodules for bash testing --- .gitmodules | 9 +++++++++ scripts/tests/bats | 1 + scripts/tests/test_helper/bats-assert | 1 + scripts/tests/test_helper/bats-support | 1 + 4 files changed, 12 insertions(+) create mode 100644 .gitmodules create mode 160000 scripts/tests/bats create mode 160000 scripts/tests/test_helper/bats-assert create mode 160000 scripts/tests/test_helper/bats-support diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..ea9e094e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "scripts/tests/bats"] + path = scripts/tests/bats + url = https://github.com/bats-core/bats-core.git +[submodule "scripts/tests/test_helper/bats-assert"] + path = scripts/tests/test_helper/bats-assert + url = https://github.com/bats-core/bats-assert.git 
+[submodule "scripts/tests/test_helper/bats-support"] + path = scripts/tests/test_helper/bats-support + url = https://github.com/bats-core/bats-support.git diff --git a/scripts/tests/bats b/scripts/tests/bats new file mode 160000 index 00000000..ce5ca280 --- /dev/null +++ b/scripts/tests/bats @@ -0,0 +1 @@ +Subproject commit ce5ca2802fabe5dc38393240cd40e20f8928d3b0 diff --git a/scripts/tests/test_helper/bats-assert b/scripts/tests/test_helper/bats-assert new file mode 160000 index 00000000..e0de84e9 --- /dev/null +++ b/scripts/tests/test_helper/bats-assert @@ -0,0 +1 @@ +Subproject commit e0de84e9c011223e7f88b7ccf1c929f4327097ba diff --git a/scripts/tests/test_helper/bats-support b/scripts/tests/test_helper/bats-support new file mode 160000 index 00000000..d140a650 --- /dev/null +++ b/scripts/tests/test_helper/bats-support @@ -0,0 +1 @@ +Subproject commit d140a65044b2d6810381935ae7f0c94c7023c8c3 From e009fab5047315356a1bf083e847c9de2f4fd6db Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 12 Jun 2021 08:41:38 +1000 Subject: [PATCH 07/38] Add empty files --- scripts/tests/README.md | 0 scripts/tests/test_extract_failed_ids.sh | 0 scripts/tests/test_extract_successful_ids.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 scripts/tests/README.md create mode 100644 scripts/tests/test_extract_failed_ids.sh create mode 100644 scripts/tests/test_extract_successful_ids.sh diff --git a/scripts/tests/README.md b/scripts/tests/README.md new file mode 100644 index 00000000..e69de29b diff --git a/scripts/tests/test_extract_failed_ids.sh b/scripts/tests/test_extract_failed_ids.sh new file mode 100644 index 00000000..e69de29b diff --git a/scripts/tests/test_extract_successful_ids.sh b/scripts/tests/test_extract_successful_ids.sh new file mode 100644 index 00000000..e69de29b From c5c010bce025fe4cf56c1ccfc40c27eb05db2c11 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 12 Jun 2021 10:35:31 +1000 Subject: [PATCH 08/38] Rename option --- README.md | 2 
+- bdfr/__main__.py | 2 +- bdfr/archiver.py | 2 +- bdfr/configuration.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bf6d4f9b..be4f4559 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ The following options are for the `archive` command specifically. - `json` (default) - `xml` - `yaml` -- `--full-context` +- `--comment-context` - This option will, instead of downloading an individual comment, download the submission that comment is a part of - May result in a longer run time as it retrieves much more data diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 6312c769..67e4f997 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -50,7 +50,7 @@ _archiver_options = [ click.option('--all-comments', is_flag=True, default=None), - click.option('--full-context', is_flag=True, default=None), + click.option('--comment-context', is_flag=True, default=None), click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None), ] diff --git a/bdfr/archiver.py b/bdfr/archiver.py index f2870cc9..74b92e86 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -61,7 +61,7 @@ def _pull_lever_entry_factory(praw_item: (praw.models.Submission, praw.models.Co raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}') def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)): - if self.args.full_context and isinstance(praw_item, praw.models.Comment): + if self.args.comment_context and isinstance(praw_item, praw.models.Comment): logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}') praw_item = praw_item.submission archive_entry = self._pull_lever_entry_factory(praw_item) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 558b79f4..36a18608 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -43,7 +43,7 @@ def __init__(self): # Archiver-specific options self.all_comments = False 
self.format = 'json' - self.full_context: bool = False + self.comment_context: bool = False def process_click_arguments(self, context: click.Context): for arg_key in context.params.keys(): From a8bc4f999e603f3f5e4555569c65d3cfaeb509eb Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 12 Jun 2021 10:41:50 +1000 Subject: [PATCH 09/38] Rename files to proper extension --- .../{test_extract_failed_ids.sh => test_extract_failed_ids.bats} | 0 ...extract_successful_ids.sh => test_extract_successful_ids.bats} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename scripts/tests/{test_extract_failed_ids.sh => test_extract_failed_ids.bats} (100%) rename scripts/tests/{test_extract_successful_ids.sh => test_extract_successful_ids.bats} (100%) diff --git a/scripts/tests/test_extract_failed_ids.sh b/scripts/tests/test_extract_failed_ids.bats similarity index 100% rename from scripts/tests/test_extract_failed_ids.sh rename to scripts/tests/test_extract_failed_ids.bats diff --git a/scripts/tests/test_extract_successful_ids.sh b/scripts/tests/test_extract_successful_ids.bats similarity index 100% rename from scripts/tests/test_extract_successful_ids.sh rename to scripts/tests/test_extract_successful_ids.bats From 7c27b7bf127b38b8c8f020ef5a645836b33fc41a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 13 Jun 2021 09:49:42 +1000 Subject: [PATCH 10/38] Update logging message --- bdfr/downloader.py | 2 +- scripts/extract_failed_ids.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index a0d8834d..ab6bf561 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -103,7 +103,7 @@ def _download_submission(self, submission: praw.models.Submission): logger.debug(f'Written file to {destination}') except OSError as e: logger.exception(e) - logger.error(f'Failed to write file to {destination} in submission {submission.id}: {e}') + logger.error(f'Failed to write file in submission {submission.id} to {destination}: 
{e}') return creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple()) os.utime(destination, (creation_time, creation_time)) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 104c7af5..f96bd9a6 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -11,13 +11,13 @@ if [ -n "$2" ]; then output="$2" echo "Outputting IDs to $output" else - output="failed.txt" + output="./failed.txt" fi { grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; - grep 'Failed to write file' "$file" | awk '{ print $16 }' | rev | cut -c 2- | rev ; + grep 'Failed to write file' "$file" | awk '{ print $13 }' | rev | cut -c 2- | rev ; grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ; } >>"$output" From 72238f39bac5c0b3f5041a479bceb532f3d5fabc Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 13 Jun 2021 09:49:57 +1000 Subject: [PATCH 11/38] Update script --- scripts/extract_successful_ids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index 19e8bd75..011ba6c0 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -11,7 +11,7 @@ if [ -n "$2" ]; then output="$2" echo "Outputting IDs to $output" else - output="successful.txt" + output="./successful.txt" fi { From 6755d15675b7cebaa98596507ab95fe7f259ab5a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 13 Jun 2021 09:50:41 +1000 Subject: [PATCH 12/38] Add tests for bash scripts --- .../failed_disabled_module.txt | 1 + .../example_logfiles/failed_no_downloader.txt | 3 ++ .../failed_resource_error.txt | 2 + .../failed_sitedownloader_error.txt | 2 + .../example_logfiles/failed_write_error.txt | 1 + 
.../succeed_already_exists.txt | 3 ++ .../succeed_download_filter.txt | 3 ++ .../succeed_downloaded_submission.txt | 7 +++ .../example_logfiles/succeed_hard_link.txt | 1 + .../succeed_resource_hash.txt | 1 + scripts/tests/test_extract_failed_ids.bats | 43 +++++++++++++++++++ .../tests/test_extract_successful_ids.bats | 38 ++++++++++++++++ 12 files changed, 105 insertions(+) create mode 100644 scripts/tests/example_logfiles/failed_disabled_module.txt create mode 100644 scripts/tests/example_logfiles/failed_no_downloader.txt create mode 100644 scripts/tests/example_logfiles/failed_resource_error.txt create mode 100644 scripts/tests/example_logfiles/failed_sitedownloader_error.txt create mode 100644 scripts/tests/example_logfiles/failed_write_error.txt create mode 100644 scripts/tests/example_logfiles/succeed_already_exists.txt create mode 100644 scripts/tests/example_logfiles/succeed_download_filter.txt create mode 100644 scripts/tests/example_logfiles/succeed_downloaded_submission.txt create mode 100644 scripts/tests/example_logfiles/succeed_hard_link.txt create mode 100644 scripts/tests/example_logfiles/succeed_resource_hash.txt diff --git a/scripts/tests/example_logfiles/failed_disabled_module.txt b/scripts/tests/example_logfiles/failed_disabled_module.txt new file mode 100644 index 00000000..50fd5524 --- /dev/null +++ b/scripts/tests/example_logfiles/failed_disabled_module.txt @@ -0,0 +1 @@ +[2021-06-12 12:49:18,452 - bdfr.downloader - DEBUG] - Submission m2601g skipped due to disabled module Direct diff --git a/scripts/tests/example_logfiles/failed_no_downloader.txt b/scripts/tests/example_logfiles/failed_no_downloader.txt new file mode 100644 index 00000000..511d11f3 --- /dev/null +++ b/scripts/tests/example_logfiles/failed_no_downloader.txt @@ -0,0 +1,3 @@ +[2021-06-12 11:13:35,665 - bdfr.downloader - ERROR] - Could not download submission nxv3ew: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.447961v1?rss=1 +[2021-06-12 
11:14:21,958 - bdfr.downloader - ERROR] - Could not download submission nxv3ek: No downloader module exists for url https://alkossegyedit.hu/termek/pluss-macko-poloval-20cm/?feed_id=34832&_unique_id=60c40a1190ccb&utm_source=Reddit&utm_medium=AEAdmin&utm_campaign=Poster +[2021-06-12 11:17:53,456 - bdfr.downloader - ERROR] - Could not download submission nxv3ea: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.448067v1?rss=1 diff --git a/scripts/tests/example_logfiles/failed_resource_error.txt b/scripts/tests/example_logfiles/failed_resource_error.txt new file mode 100644 index 00000000..c2ba24c8 --- /dev/null +++ b/scripts/tests/example_logfiles/failed_resource_error.txt @@ -0,0 +1,2 @@ +[2021-06-12 11:18:25,794 - bdfr.downloader - ERROR] - Failed to download resource https://i.redd.it/61fniokpjq471.jpg in submission nxv3dt with downloader Direct: Unrecoverable error requesting resource: HTTP Code 404 + diff --git a/scripts/tests/example_logfiles/failed_sitedownloader_error.txt b/scripts/tests/example_logfiles/failed_sitedownloader_error.txt new file mode 100644 index 00000000..379ddacd --- /dev/null +++ b/scripts/tests/example_logfiles/failed_sitedownloader_error.txt @@ -0,0 +1,2 @@ +[2021-06-12 08:38:35,657 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxr7x9: No images found in Reddit gallery +[2021-06-12 08:47:22,005 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxpn0h: Server responded with 503 to https://www.reddit.com/gallery/nxpkvh diff --git a/scripts/tests/example_logfiles/failed_write_error.txt b/scripts/tests/example_logfiles/failed_write_error.txt new file mode 100644 index 00000000..24623668 --- /dev/null +++ b/scripts/tests/example_logfiles/failed_write_error.txt @@ -0,0 +1 @@ +[2021-06-09 22:01:04,530 - bdfr.downloader - ERROR] - Failed to write file in submission nnboza to C:\Users\Yoga 14\path\to\output\ThotNetwork\KatieCarmine_I POST A NEW VIDEO ALMOST 
EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4: [Errno 2] No such file or directory: 'C:\\Users\\Yoga 14\\path\\to\\output\\ThotNetwork\\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4' diff --git a/scripts/tests/example_logfiles/succeed_already_exists.txt b/scripts/tests/example_logfiles/succeed_already_exists.txt new file mode 100644 index 00000000..e5713d7d --- /dev/null +++ b/scripts/tests/example_logfiles/succeed_already_exists.txt @@ -0,0 +1,3 @@ +[2021-06-12 08:41:51,464 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxry0l.jpg from submission nxry0l already exists, continuing +[2021-06-12 08:41:51,469 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrlgn.gif from submission nxrlgn already exists, continuing +[2021-06-12 08:41:51,472 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrq9g.png from submission nxrq9g already exists, continuing diff --git a/scripts/tests/example_logfiles/succeed_download_filter.txt b/scripts/tests/example_logfiles/succeed_download_filter.txt new file mode 100644 index 00000000..ce4c41dd --- /dev/null +++ b/scripts/tests/example_logfiles/succeed_download_filter.txt @@ -0,0 +1,3 @@ +[2021-06-10 20:36:48,722 - bdfr.downloader - DEBUG] - Download filter removed nwfirr with URL https://www.youtube.com/watch?v=NVSiX0Tsees +[2021-06-12 19:56:36,848 - bdfr.downloader - DEBUG] - Download filter removed nwfgcl with URL https://www.reddit.com/r/MaliciousCompliance/comments/nwfgcl/new_guy_decided_to_play_manager_alright/ +[2021-06-12 19:56:28,587 - bdfr.downloader - DEBUG] - Download filter removed nxuxjy with URL 
https://www.reddit.com/r/MaliciousCompliance/comments/nxuxjy/you_want_an_omelette_with_nothing_inside_okay/ diff --git a/scripts/tests/example_logfiles/succeed_downloaded_submission.txt b/scripts/tests/example_logfiles/succeed_downloaded_submission.txt new file mode 100644 index 00000000..fde97fae --- /dev/null +++ b/scripts/tests/example_logfiles/succeed_downloaded_submission.txt @@ -0,0 +1,7 @@ +[2021-06-12 11:58:53,864 - bdfr.downloader - INFO] - Downloaded submission nxui9y from tumblr +[2021-06-12 11:58:56,618 - bdfr.downloader - INFO] - Downloaded submission nxsr4r from tumblr +[2021-06-12 11:58:59,026 - bdfr.downloader - INFO] - Downloaded submission nxviir from tumblr +[2021-06-12 11:59:00,289 - bdfr.downloader - INFO] - Downloaded submission nxusva from tumblr +[2021-06-12 11:59:00,735 - bdfr.downloader - INFO] - Downloaded submission nxvko7 from tumblr +[2021-06-12 11:59:01,215 - bdfr.downloader - INFO] - Downloaded submission nxvd63 from tumblr +[2021-06-12 11:59:13,891 - bdfr.downloader - INFO] - Downloaded submission nn9cor from tumblr diff --git a/scripts/tests/example_logfiles/succeed_hard_link.txt b/scripts/tests/example_logfiles/succeed_hard_link.txt new file mode 100644 index 00000000..6359f6ba --- /dev/null +++ b/scripts/tests/example_logfiles/succeed_hard_link.txt @@ -0,0 +1 @@ +[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Hard link made linking /media/smaug/private/reddit/tumblr/nwnp2n.jpg to /media/smaug/private/reddit/tumblr/nwskqb.jpg in submission nwnp2n diff --git a/scripts/tests/example_logfiles/succeed_resource_hash.txt b/scripts/tests/example_logfiles/succeed_resource_hash.txt new file mode 100644 index 00000000..a0897505 --- /dev/null +++ b/scripts/tests/example_logfiles/succeed_resource_hash.txt @@ -0,0 +1 @@ +[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Resource hash aaaaaaaaaaaaaaaaaaaaaaa from submission n86jk8 downloaded elsewhere diff --git a/scripts/tests/test_extract_failed_ids.bats 
b/scripts/tests/test_extract_failed_ids.bats index e69de29b..75b9bff8 100644 --- a/scripts/tests/test_extract_failed_ids.bats +++ b/scripts/tests/test_extract_failed_ids.bats @@ -0,0 +1,43 @@ +setup() { + load ./test_helper/bats-support/load + load ./test_helper/bats-assert/load +} + +teardown() { + rm -f failed.txt +} + +@test "fail run no logfile" { + run ../extract_failed_ids.sh + assert_failure +} + +@test "fail no downloader module" { + run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt + assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; +} + +@test "fail resource error" { + run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt + assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; +} + +@test "fail site downloader error" { + run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt + assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; +} + +@test "fail failed file write" { + run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt + assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; +} + +@test "fail disabled module" { + run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt + assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; +} diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index e69de29b..364bedbe 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -0,0 +1,38 @@ +setup() { + load ./test_helper/bats-support/load + load ./test_helper/bats-assert/load +} + 
+teardown() { + rm -f successful.txt +} + +@test "success downloaded submission" { + run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt + assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; +} + +@test "success resource hash" { + run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt + assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; +} + +@test "success download filter" { + run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt + assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; +} + +@test "success already exists" { + run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt + assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; +} + +@test "success hard link" { + run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt + assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; +} From fc42587a8f34367cffeca324cc00d8cef0105df5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 13 Jun 2021 13:10:13 +1000 Subject: [PATCH 13/38] Add information to sub-README --- scripts/tests/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/tests/README.md b/scripts/tests/README.md index e69de29b..8349f7a4 100644 --- a/scripts/tests/README.md +++ b/scripts/tests/README.md @@ -0,0 +1,13 @@ +# Bash Scripts Testing + +The `bats` framework is included and used to test the scripts included, specifically the scripts designed to parse through the logging output. 
As this involves delicate regex and indexes, it is necessary to test these. + +## Running Tests + +Running the tests are easy, and can be done with a single command. Once the working directory is this directory, run the following command. + +```bash +./bats/bin/bats *.bats +``` + +This will run all test files that have the `.bats` suffix. From e5be624f1e2f4d985f50b53c3c72ab78e6988721 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 23 Jun 2021 14:30:39 +1000 Subject: [PATCH 14/38] Check submission URL against filter before factory --- bdfr/downloader.py | 5 ++++- tests/test_integration.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index ab6bf561..f4220dbd 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -54,6 +54,9 @@ def _download_submission(self, submission: praw.models.Submission): elif not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') return + elif not self.download_filter.check_url(submission.url): + logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}') + return logger.debug(f'Attempting to download submission {submission.id}') try: @@ -76,7 +79,7 @@ def _download_submission(self, submission: praw.models.Submission): logger.debug(f'File {destination} from submission {submission.id} already exists, continuing') continue elif not self.download_filter.check_resource(res): - logger.debug(f'Download filter removed {submission.id} with URL {submission.url}') + logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}') continue try: res.download(self.args.max_wait_time) diff --git a/tests/test_integration.py b/tests/test_integration.py index 0a6de3dd..6bad3f64 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -189,6 +189,20 @@ def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): assert 'Download filter removed ' in 
result.output +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'tumblr', '-L', '10', '--skip-domain', 'i.redd.it'], +)) +def test_cli_download_download_filter_domain(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'filtered due to URL' in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow From ccafebf5fede757198a03cd460d887d13e15fedb Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 23 Jun 2021 14:59:26 +1000 Subject: [PATCH 15/38] Update test --- tests/site_downloaders/test_youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index 986b0dbc..afaebb72 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -14,7 +14,7 @@ @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'), - ('https://www.youtube.com/watch?v=m-tKnjFwleU', '30314930d853afff8ebc7d8c36a5b833'), + ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '2bfdbf434ed284623e46f3bf52c36166'), )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() From 3dacaf0872cbb61b29dd73506b74dc758c77732e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 24 Jun 2021 16:12:41 +1000 Subject: [PATCH 16/38] Fix renamed option in test --- tests/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 6bad3f64..3ecfcd6f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ 
-270,7 +270,7 @@ def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['--full-context', '--link', 'gxqapql'], + ['--comment-context', '--link', 'gxqapql'], )) def test_cli_archive_full_context(test_args: list[str], tmp_path: Path): runner = CliRunner() From 31be3a916e1bd4f8d77a137eba3589d80b611f48 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 24 Jun 2021 16:14:05 +1000 Subject: [PATCH 17/38] Enable integration tests to be run concurrently --- tests/test_integration.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 3ecfcd6f..0b4d36b7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,6 +2,7 @@ # coding=utf-8 import re +import shutil from pathlib import Path import pytest @@ -12,22 +13,28 @@ does_test_config_exist = Path('test_config.cfg').exists() +def copy_test_config(tmp_path: Path): + shutil.copy(Path('test_config.cfg'), Path(tmp_path, 'test_config.cfg')) + + def create_basic_args_for_download_runner(test_args: list[str], tmp_path: Path): + copy_test_config(tmp_path) out = [ 'download', str(tmp_path), '-v', - '--config', 'test_config.cfg', + '--config', str(Path(tmp_path, 'test_config.cfg')), '--log', str(Path(tmp_path, 'test_log.txt')), ] + test_args return out def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path): + copy_test_config(tmp_path) out = [ 'archive', str(tmp_path), '-v', - '--config', 'test_config.cfg', + '--config', str(Path(tmp_path, 'test_config.cfg')), '--log', str(Path(tmp_path, 'test_log.txt')), ] + test_args return out From 1d187fcf656f8237d73c289af9f45d5e95ac3ad3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 24 Jun 2021 16:14:21 +1000 Subject: [PATCH 18/38] Consolidate tests --- 
tests/test_integration.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 0b4d36b7..5465c5be 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -187,27 +187,14 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): @pytest.mark.parametrize('test_args', ( ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'], ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip', 'txt'], -)) -def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_download_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert 'Download filter removed ' in result.output - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( ['--subreddit', 'tumblr', '-L', '10', '--skip-domain', 'i.redd.it'], )) -def test_cli_download_download_filter_domain(test_args: list[str], tmp_path: Path): +def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): runner = CliRunner() test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert 'filtered due to URL' in result.output + assert any((string in result.output for string in ('Download filter removed ', 'filtered due to URL'))) @pytest.mark.online From 640001a7f57d186032822c4482d5b9f0f349be5b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 24 Jun 2021 16:37:25 +1000 Subject: [PATCH 19/38] Speed up test --- tests/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 5465c5be..19d884d5 100644 --- a/tests/test_integration.py +++ 
b/tests/test_integration.py @@ -170,7 +170,7 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['--subreddit', 'python', '-L', 10, '--search-existing'], + ['--subreddit', 'python', '-L', 1, '--search-existing'], )) def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): Path(tmp_path, 'test.txt').touch() From 8b1a3d9abcc83e39cfdc1c58702d7686d46c5aba Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 24 Jun 2021 16:38:34 +1000 Subject: [PATCH 20/38] Enable integration tests to be run concurrently --- tests/test_integration.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 19d884d5..5f1dfea6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -13,29 +13,29 @@ does_test_config_exist = Path('test_config.cfg').exists() -def copy_test_config(tmp_path: Path): - shutil.copy(Path('test_config.cfg'), Path(tmp_path, 'test_config.cfg')) +def copy_test_config(run_path: Path): + shutil.copy(Path('test_config.cfg'), Path(run_path, 'test_config.cfg')) -def create_basic_args_for_download_runner(test_args: list[str], tmp_path: Path): - copy_test_config(tmp_path) +def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): + copy_test_config(run_path) out = [ - 'download', str(tmp_path), + 'download', str(run_path), '-v', - '--config', str(Path(tmp_path, 'test_config.cfg')), - '--log', str(Path(tmp_path, 'test_log.txt')), + '--config', str(Path(run_path, 'test_config.cfg')), + '--log', str(Path(run_path, 'test_log.txt')), ] + test_args return out -def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path): - copy_test_config(tmp_path) +def create_basic_args_for_archive_runner(test_args: 
list[str], run_path: Path): + copy_test_config(run_path) out = [ 'archive', - str(tmp_path), + str(run_path), '-v', - '--config', str(Path(tmp_path, 'test_config.cfg')), - '--log', str(Path(tmp_path, 'test_log.txt')), + '--config', str(Path(run_path, 'test_config.cfg')), + '--log', str(Path(run_path, 'test_log.txt')), ] + test_args return out From 1a52dfdcbcd99e10a5f4ef28869b8b3f893ac51e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 25 Jun 2021 17:47:49 +1000 Subject: [PATCH 21/38] Add PornHub module --- bdfr/site_downloaders/download_factory.py | 3 ++ bdfr/site_downloaders/pornhub.py | 30 +++++++++++++++++++ .../site_downloaders/test_download_factory.py | 2 ++ tests/site_downloaders/test_pornhub.py | 25 ++++++++++++++++ 4 files changed, 60 insertions(+) create mode 100644 bdfr/site_downloaders/pornhub.py create mode 100644 tests/site_downloaders/test_pornhub.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 41813f9a..911e8fbe 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -13,6 +13,7 @@ from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur +from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.youtube import Youtube @@ -43,6 +44,8 @@ def pull_lever(url: str) -> Type[BaseDownloader]: return Youtube elif re.match(r'i\.redd\.it.*', sanitised_url): return Direct + elif re.match(r'pornhub\.com.*', sanitised_url): + return PornHub elif YoutubeDlFallback.can_handle_link(sanitised_url): return YoutubeDlFallback else: diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py new file mode 100644 index 00000000..924a6b8f --- /dev/null +++ b/bdfr/site_downloaders/pornhub.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# 
coding=utf-8 + +import logging +import tempfile +from pathlib import Path +from typing import Optional + +import youtube_dl +from praw.models import Submission + +from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.youtube import Youtube + +logger = logging.getLogger(__name__) + + +class PornHub(Youtube): + def __init__(self, post: Submission): + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + ytdl_options = { + 'format': 'best', + 'nooverwrites': True, + } + out = self._download_video(ytdl_options) + return [out] diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 4b5356c1..95b522dc 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -13,6 +13,7 @@ from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur +from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.youtube import Youtube @@ -44,6 +45,7 @@ ('https://streamable.com/dt46y', YoutubeDlFallback), ('https://vimeo.com/channels/31259/53576664', YoutubeDlFallback), ('http://video.pbs.org/viralplayer/2365173446/', YoutubeDlFallback), + ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', PornHub), )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py new file mode 100644 index 00000000..12144ddd --- /dev/null +++ 
b/tests/site_downloaders/test_pornhub.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import MagicMock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.pornhub import PornHub + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', '5f5294b9b97dbb7cb9cf8df278515621'), +)) +def test_find_resources_good(test_url: str, expected_hash: str): + test_submission = MagicMock() + test_submission.url = test_url + downloader = PornHub(test_submission) + resources = downloader.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download(120) + assert resources[0].hash.hexdigest() == expected_hash From e8998da2f00ca34cc53e899a2640904b1d5c721b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 24 Jun 2021 20:10:31 +1000 Subject: [PATCH 22/38] Catch some Imgur errors with weird links --- bdfr/site_downloaders/imgur.py | 5 +++-- tests/site_downloaders/test_imgur.py | 14 ++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index bd974be6..44a62f1e 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -37,9 +37,10 @@ def _compute_image_url(self, image: dict) -> Resource: @staticmethod def _get_data(link: str) -> dict: - if re.match(r'.*\.gifv$', link): + link = link.rstrip('?') + if re.match(r'(?i).*\.gifv$', link): link = link.replace('i.imgur', 'imgur') - link = re.sub('\\.gifv$', '', link) + link = re.sub('(?i)\\.gifv$', '', link) res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'}) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 0e557edb..aa937956 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -132,10 +132,16 @@ 
def test_imgur_extension_validation_bad(test_extension: str): ), ( 'https://i.imgur.com/lFJai6i.gifv', - ( - '01a6e79a30bec0e644e5da12365d5071', - ), - ) + ('01a6e79a30bec0e644e5da12365d5071',), + ), + ( + 'https://i.imgur.com/ywSyILa.gifv?', + ('56d4afc32d2966017c38d98568709b45',), + ), + ( + 'https://imgur.com/ubYwpbk.GIFV', + ('d4a774aac1667783f9ed3a1bd02fac0c',), + ), )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() From d53b3b7274554b6750c1fe301f00085933172a51 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 25 Jun 2021 14:00:10 +1000 Subject: [PATCH 23/38] Update gallery code to work with NSFW galleries --- bdfr/site_downloaders/gallery.py | 29 ++++++------ tests/site_downloaders/test_gallery.py | 64 ++++++++++++++------------ 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index 2c59c05b..b3bae265 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -5,6 +5,7 @@ from typing import Optional import bs4 +import requests from praw.models import Submission from bdfr.exceptions import SiteDownloaderError @@ -20,21 +21,21 @@ def __init__(self, post: Submission): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - image_urls = self._get_links(self.post.url) + image_urls = self._get_links(self.post.gallery_data['items']) if not image_urls: raise SiteDownloaderError('No images found in Reddit gallery') return [Resource(self.post, url) for url in image_urls] - @staticmethod - def _get_links(url: str) -> list[str]: - resource_headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - } - page = Gallery.retrieve_url(url, 
headers=resource_headers) - soup = bs4.BeautifulSoup(page.text, 'html.parser') - - links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')}) - links = [link.get('href') for link in links] - return links + @ staticmethod + def _get_links(id_dict: list[dict]) -> list[str]: + out = [] + for item in id_dict: + image_id = item['media_id'] + possible_extensions = ('.jpg', '.png', '.gif', '.gifv', '.jpeg') + for extension in possible_extensions: + test_url = f'https://i.redd.it/{image_id}{extension}' + response = requests.head(test_url) + if response.status_code == 200: + out.append(test_url) + break + return out diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index e903e04c..857f1486 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -8,30 +8,32 @@ @pytest.mark.online -@pytest.mark.parametrize(('test_url', 'expected'), ( - ('https://www.reddit.com/gallery/m6lvrh', { - 'https://preview.redd.it/18nzv9ch0hn61.jpg?width=4160&' - 'format=pjpg&auto=webp&s=470a825b9c364e0eace0036882dcff926f821de8', - 'https://preview.redd.it/jqkizcch0hn61.jpg?width=4160&' - 'format=pjpg&auto=webp&s=ae4f552a18066bb6727676b14f2451c5feecf805', - 'https://preview.redd.it/k0fnqzbh0hn61.jpg?width=4160&' - 'format=pjpg&auto=webp&s=c6a10fececdc33983487c16ad02219fd3fc6cd76', - 'https://preview.redd.it/m3gamzbh0hn61.jpg?width=4160&' - 'format=pjpg&auto=webp&s=0dd90f324711851953e24873290b7f29ec73c444' +@pytest.mark.parametrize(('test_ids', 'expected'), ( + ([ + {'media_id': '18nzv9ch0hn61'}, + {'media_id': 'jqkizcch0hn61'}, + {'media_id': 'k0fnqzbh0hn61'}, + {'media_id': 'm3gamzbh0hn61'}, + ], { + 'https://i.redd.it/18nzv9ch0hn61.jpg', + 'https://i.redd.it/jqkizcch0hn61.jpg', + 'https://i.redd.it/k0fnqzbh0hn61.jpg', + 'https://i.redd.it/m3gamzbh0hn61.jpg' }), - ('https://www.reddit.com/gallery/ljyy27', { - 'https://preview.redd.it/04vxj25uqih61.png?width=92&' - 
'format=png&auto=webp&s=6513f3a5c5128ee7680d402cab5ea4fb2bbeead4', - 'https://preview.redd.it/0fnx83kpqih61.png?width=241&' - 'format=png&auto=webp&s=655e9deb6f499c9ba1476eaff56787a697e6255a', - 'https://preview.redd.it/7zkmr1wqqih61.png?width=237&' - 'format=png&auto=webp&s=19de214e634cbcad9959f19570c616e29be0c0b0', - 'https://preview.redd.it/u37k5gxrqih61.png?width=443&' - 'format=png&auto=webp&s=e74dae31841fe4a2545ffd794d3b25b9ff0eb862' + ([ + {'media_id': '04vxj25uqih61'}, + {'media_id': '0fnx83kpqih61'}, + {'media_id': '7zkmr1wqqih61'}, + {'media_id': 'u37k5gxrqih61'}, + ], { + 'https://i.redd.it/04vxj25uqih61.png', + 'https://i.redd.it/0fnx83kpqih61.png', + 'https://i.redd.it/7zkmr1wqqih61.png', + 'https://i.redd.it/u37k5gxrqih61.png' }), )) -def test_gallery_get_links(test_url: str, expected: set[str]): - results = Gallery._get_links(test_url) +def test_gallery_get_links(test_ids: list[dict], expected: set[str]): + results = Gallery._get_links(test_ids) assert set(results) == expected @@ -39,16 +41,20 @@ def test_gallery_get_links(test_url: str, expected: set[str]): @pytest.mark.reddit @pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), ( ('m6lvrh', { - '6c8a892ae8066cbe119218bcaac731e1', - '93ce177f8cb7994906795f4615114d13', - '9a293adf19354f14582608cf22124574', - 'b73e2c3daee02f99404644ea02f1ae65' + '5c42b8341dd56eebef792e86f3981c6a', + '8f38d76da46f4057bf2773a778e725ca', + 'f5776f8f90491c8b770b8e0a6bfa49b3', + 'fa1a43c94da30026ad19a9813a0ed2c2', }), ('ljyy27', { - '1bc38bed88f9c4770e22a37122d5c941', - '2539a92b78f3968a069df2dffe2279f9', - '37dea50281c219b905e46edeefc1a18d', - 'ec4924cf40549728dcf53dd40bc7a73c' + '359c203ec81d0bc00e675f1023673238', + '79262fd46bce5bfa550d878a3b898be4', + '808c35267f44acb523ce03bfa5687404', + 'ec8b65bdb7f1279c4b3af0ea2bbb30c3', + }), + ('nxyahw', { + 'b89a3f41feb73ec1136ec4ffa7353eb1', + 'cabb76fd6fd11ae6e115a2039eb09f04', }), )) def test_gallery_download(test_submission_id: str, expected_hashes: set[str], 
reddit_instance: praw.Reddit): From 528f5c567db3964cd9a25e549d3dbcbf099df9ac Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 30 Jun 2021 11:59:38 +1000 Subject: [PATCH 24/38] Add additional test for Redgifs --- tests/site_downloaders/test_redgifs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 71fc18ef..476149fc 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -31,6 +31,7 @@ def test_get_link(test_url: str, expected: str): ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'), ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'), ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'), + ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'), )) def test_download_resource(test_url: str, expected_hash: str): mock_submission = Mock() From ffd07f38ba02286df3400e98267bd7765d263597 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 30 Jun 2021 12:52:27 +1000 Subject: [PATCH 25/38] Fix broken subreddit test --- tests/test_connector.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index 2249b963..e561b969 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -29,6 +29,8 @@ def downloader_mock(args: Configuration): downloader_mock = MagicMock() downloader_mock.args = args downloader_mock.sanitise_subreddit_name = RedditConnector.sanitise_subreddit_name + downloader_mock.create_filtered_listing_generator = lambda x: RedditConnector.create_filtered_listing_generator( + downloader_mock, x) downloader_mock.split_args_input = RedditConnector.split_args_input downloader_mock.master_hash_list = {} return downloader_mock @@ -37,6 +39,7 @@ def downloader_mock(args: 
Configuration): def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]) -> list: results = [sub for res in results for sub in res] assert all([isinstance(res, praw.models.Submission) for res in results]) + assert not any([isinstance(m, MagicMock) for m in results]) if result_limit is not None: assert len(results) == result_limit return results @@ -167,18 +170,20 @@ def test_get_subreddit_normal( downloader_mock: MagicMock, reddit_instance: praw.Reddit, ): - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.args.limit = limit downloader_mock.args.sort = sort_type + downloader_mock.time_filter = RedditConnector.create_time_filter(downloader_mock) + downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock) + downloader_mock.determine_sort_function.return_value = RedditConnector.determine_sort_function(downloader_mock) downloader_mock.args.subreddit = test_subreddits downloader_mock.reddit_instance = reddit_instance - downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock) results = RedditConnector.get_subreddits(downloader_mock) - test_subreddits = downloader_mock._split_args_input(test_subreddits) + test_subreddits = downloader_mock.split_args_input(test_subreddits) results = [sub for res1 in results for sub in res1] assert all([isinstance(res1, praw.models.Submission) for res1 in results]) assert all([res.subreddit.display_name in test_subreddits for res in results]) assert len(results) <= max_expected_len + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online @@ -212,6 +217,7 @@ def test_get_subreddit_search( assert all([isinstance(res, praw.models.Submission) for res in results]) assert all([res.subreddit.display_name in test_subreddits for res in results]) assert len(results) <= max_expected_len + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online @@ -243,6 +249,7 @@ def 
test_get_multireddits_public( results = [sub for res in results for sub in res] assert all([isinstance(res, praw.models.Submission) for res in results]) assert len(results) == limit + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online @@ -268,6 +275,7 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic results = RedditConnector.get_user_data(downloader_mock) results = assert_all_results_are_submissions(limit, results) assert all([res.author.name == test_user for res in results]) + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online From 469a7783b86ab0759b01276c086a282815731c25 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:00:48 +1000 Subject: [PATCH 26/38] Split integration tests --- tests/integration_tests/__init__.py | 2 + .../test_archive_integration.py | 108 +++++++++++++++ .../test_clone_integration.py | 44 +++++++ .../test_download_integration.py} | 124 +----------------- 4 files changed, 157 insertions(+), 121 deletions(-) create mode 100644 tests/integration_tests/__init__.py create mode 100644 tests/integration_tests/test_archive_integration.py create mode 100644 tests/integration_tests/test_clone_integration.py rename tests/{test_integration.py => integration_tests/test_download_integration.py} (71%) diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py new file mode 100644 index 00000000..d4c1799a --- /dev/null +++ b/tests/integration_tests/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# coding=utf-8 diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py new file mode 100644 index 00000000..8cbb2d55 --- /dev/null +++ b/tests/integration_tests/test_archive_integration.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import re +import shutil +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from 
bdfr.__main__ import cli + +does_test_config_exist = Path('../test_config.cfg').exists() + + +def copy_test_config(run_path: Path): + shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) + + +def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path): + copy_test_config(run_path) + out = [ + 'archive', + str(run_path), + '-v', + '--config', str(Path(run_path, '../test_config.cfg')), + '--log', str(Path(run_path, 'test_log.txt')), + ] + test_args + return out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'gstd4hk'], + ['-l', 'm2601g', '-f', 'yaml'], + ['-l', 'n60t4c', '-f', 'xml'], +)) +def test_cli_archive_single(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing entry .*? to file in .*? format', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'Mindustry', '-L', 25], + ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'], + ['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'], + ['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'], + ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'], + ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'], +)) +def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing entry .*? to file in .*? 
format', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'], + ['--user', 'me', '--user', 'djnish', '--authenticate', '--all-comments', '-L', '10'], +)) +def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--comment-context', '--link', 'gxqapql'], +)) +def test_cli_archive_full_context(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Converting comment' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.slow +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'all', '-L', 100], + ['--subreddit', 'all', '-L', 100, '--sort', 'new'], +)) +def test_cli_archive_long(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing entry .*? to file in .*? 
format', result.output) diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py new file mode 100644 index 00000000..84892fc1 --- /dev/null +++ b/tests/integration_tests/test_clone_integration.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import re +import shutil +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from bdfr.__main__ import cli + +does_test_config_exist = Path('../test_config.cfg').exists() + + +def copy_test_config(run_path: Path): + shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) + + +def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): + out = [ + 'clone', + str(tmp_path), + '-v', + '--config', 'test_config.cfg', + '--log', str(Path(tmp_path, 'test_log.txt')), + ] + test_args + return out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'm2601g'], + ['-s', 'TrollXChromosomes/', '-L', 1], +)) +def test_cli_scrape_general(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_cloner_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Downloaded submission' in result.output + assert 'Record for entry item' in result.output diff --git a/tests/test_integration.py b/tests/integration_tests/test_download_integration.py similarity index 71% rename from tests/test_integration.py rename to tests/integration_tests/test_download_integration.py index 5f1dfea6..fca0f8b6 100644 --- a/tests/test_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -10,11 +10,11 @@ from bdfr.__main__ import cli -does_test_config_exist = Path('test_config.cfg').exists() +does_test_config_exist = Path('../test_config.cfg').exists() def 
copy_test_config(run_path: Path): - shutil.copy(Path('test_config.cfg'), Path(run_path, 'test_config.cfg')) + shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): @@ -22,35 +22,12 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): out = [ 'download', str(run_path), '-v', - '--config', str(Path(run_path, 'test_config.cfg')), + '--config', str(Path(run_path, '../test_config.cfg')), '--log', str(Path(run_path, 'test_log.txt')), ] + test_args return out -def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path): - copy_test_config(run_path) - out = [ - 'archive', - str(run_path), - '-v', - '--config', str(Path(run_path, 'test_config.cfg')), - '--log', str(Path(run_path, 'test_log.txt')), - ] + test_args - return out - - -def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): - out = [ - 'clone', - str(tmp_path), - '-v', - '--config', 'test_config.cfg', - '--log', str(Path(tmp_path, 'test_log.txt')), - ] + test_args - return out - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @@ -211,85 +188,6 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['-l', 'gstd4hk'], - ['-l', 'm2601g', '-f', 'yaml'], - ['-l', 'n60t4c', '-f', 'xml'], -)) -def test_cli_archive_single(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert re.search(r'Writing entry .*? to file in .*? 
format', result.output) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--subreddit', 'Mindustry', '-L', 25], - ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'], - ['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'], - ['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'], - ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'], - ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'], -)) -def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert re.search(r'Writing entry .*? to file in .*? format', result.output) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'], - ['--user', 'me', '--user', 'djnish', '--authenticate', '--all-comments', '-L', '10'], -)) -def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--comment-context', '--link', 'gxqapql'], -)) -def test_cli_archive_full_context(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - 
assert 'Converting comment' in result.output - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.slow -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--subreddit', 'all', '-L', 100], - ['--subreddit', 'all', '-L', 100, '--sort', 'new'], -)) -def test_cli_archive_long(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert re.search(r'Writing entry .*? to file in .*? format', result.output) - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow @@ -393,19 +291,3 @@ def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'skipped due to disabled module' in result.output assert 'Downloaded submission' not in result.output - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g'], - ['-s', 'TrollXChromosomes/', '-L', 1], -)) -def test_cli_scrape_general(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_cloner_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert 'Downloaded submission' in result.output - assert 'Record for entry item' in result.output From edfeb653a4780f4dade685d8bba2ac4e451bc450 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 1 Jul 2021 13:11:31 +1000 Subject: [PATCH 27/38] Record user flair in comment archive entries --- bdfr/archive_entry/base_archive_entry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bdfr/archive_entry/base_archive_entry.py b/bdfr/archive_entry/base_archive_entry.py index 7b84fbe7..516e5d06 100644 --- 
a/bdfr/archive_entry/base_archive_entry.py +++ b/bdfr/archive_entry/base_archive_entry.py @@ -22,6 +22,7 @@ def _convert_comment_to_dict(in_comment: Comment) -> dict: 'id': in_comment.id, 'score': in_comment.score, 'subreddit': in_comment.subreddit.display_name, + 'author_flair': in_comment.author_flair_text, 'submission': in_comment.submission.id, 'stickied': in_comment.stickied, 'body': in_comment.body, From bd34c37052f7b0bf2e678051fb464a49dbd0adcc Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 30 Jun 2021 12:21:17 +1000 Subject: [PATCH 28/38] Add exception for special friends subreddit --- bdfr/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 68efc0c7..8a6f0bf9 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -394,7 +394,7 @@ def download(self): @staticmethod def check_subreddit_status(subreddit: praw.models.Subreddit): - if subreddit.display_name == 'all': + if subreddit.display_name in ('all', 'friends'): return try: assert subreddit.id From c4aa6177372e73a469e58cdc5de57675de69a7a8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:17:13 +1000 Subject: [PATCH 29/38] Add test for friends subreddit --- .../test_download_integration.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index fca0f8b6..56da1d5b 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -58,6 +58,21 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): assert 'Added submissions from subreddit ' in result.output +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'friends', '-L', 10, 
'--authenticate'], +)) +def test_cli_download_user_specific_subreddits(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Added submissions from subreddit ' in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') From 8db9d0bcc4119d22b3fcb4e6810d737bf981957c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:29:39 +1000 Subject: [PATCH 30/38] Add test for unauthenticated instances --- bdfr/connector.py | 3 +++ tests/integration_tests/test_download_integration.py | 1 + 2 files changed, 4 insertions(+) diff --git a/bdfr/connector.py b/bdfr/connector.py index 8a6f0bf9..a4165fcb 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -242,6 +242,9 @@ def get_subreddits(self) -> list[praw.models.ListingGenerator]: if self.args.subreddit: out = [] for reddit in self.split_args_input(self.args.subreddit): + if reddit == 'friends' and self.authenticated is False: + logger.error('Cannot read friends subreddit without an authenticated instance') + continue try: reddit = self.reddit_instance.subreddit(reddit) try: diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 56da1d5b..4ee0bba4 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -214,6 +214,7 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): ['--subreddit', 'submitters', '-L', 10], # Private subreddit ['--subreddit', 'donaldtrump', '-L', 10], # Banned subreddit ['--user', 'djnish', '--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10], + ['--subreddit', 'friends', '-L', 10], )) def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path): 
runner = CliRunner() From 1319eeb6dafcaf4327e65dc9c8faf033c2e0aaf0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:53:02 +1000 Subject: [PATCH 31/38] Fix error with crossposted Reddit galleries --- bdfr/site_downloaders/gallery.py | 11 ++++++++++- tests/site_downloaders/test_gallery.py | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index b3bae265..10704197 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -21,7 +21,16 @@ def __init__(self, post: Submission): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - image_urls = self._get_links(self.post.gallery_data['items']) + try: + image_urls = self._get_links(self.post.gallery_data['items']) + except AttributeError: + try: + image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items']) + except (AttributeError, IndexError): + logger.error(f'Could not find gallery data in submission {self.post.id}') + logger.exception('Gallery image find failure') + raise SiteDownloaderError('No images found in Reddit gallery') + if not image_urls: raise SiteDownloaderError('No images found in Reddit gallery') return [Resource(self.post, url) for url in image_urls] diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index 857f1486..51045f8d 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -56,6 +56,10 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]): 'b89a3f41feb73ec1136ec4ffa7353eb1', 'cabb76fd6fd11ae6e115a2039eb09f04', }), + ('obkflw', { + '65163f685fb28c5b776e0e77122718be', + '2a337eb5b13c34d3ca3f51b5db7c13e9', + }), )) def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit): test_submission = 
reddit_instance.submission(id=test_submission_id) From 390ce57f461db854d3ff062ded10bdc5fe8d52e3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:57:10 +1000 Subject: [PATCH 32/38] Remove redundant parentheses --- tests/site_downloaders/test_youtube.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index afaebb72..f3a97e18 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -28,8 +28,9 @@ def test_find_resources_good(test_url: str, expected_hash: str): @pytest.mark.online -@pytest.mark.parametrize(('test_url'), ( - ('https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman-interview-oj-simpson-goliath-chronicles'), +@pytest.mark.parametrize('test_url', ( + 'https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman' + '-interview-oj-simpson-goliath-chronicles', )) def test_find_resources_bad(test_url: str): test_submission = MagicMock() From 6efcf1ce7e310a4475e68a83063f897324a93e5b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:58:20 +1000 Subject: [PATCH 33/38] Remove unused imports --- bdfr/site_downloaders/gallery.py | 2 -- bdfr/site_downloaders/pornhub.py | 4 ---- bdfr/site_downloaders/redgifs.py | 1 - tests/integration_tests/test_clone_integration.py | 1 - tests/integration_tests/test_download_integration.py | 1 - tests/test_downloader.py | 1 - 6 files changed, 10 deletions(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index 10704197..df161e52 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 import logging -import re from typing import Optional -import bs4 import requests from praw.models import Submission diff --git a/bdfr/site_downloaders/pornhub.py 
b/bdfr/site_downloaders/pornhub.py index 924a6b8f..6658d7e3 100644 --- a/bdfr/site_downloaders/pornhub.py +++ b/bdfr/site_downloaders/pornhub.py @@ -2,14 +2,10 @@ # coding=utf-8 import logging -import tempfile -from pathlib import Path from typing import Optional -import youtube_dl from praw.models import Submission -from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.youtube import Youtube diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 051bc124..9cfec024 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -4,7 +4,6 @@ import re from typing import Optional -from bs4 import BeautifulSoup from praw.models import Submission from bdfr.exceptions import SiteDownloaderError diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py index 84892fc1..343b2d3e 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 -import re import shutil from pathlib import Path diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 4ee0bba4..305fe99b 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 -import re import shutil from pathlib import Path diff --git a/tests/test_downloader.py b/tests/test_downloader.py index d67aee62..e5f0a314 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,7 +9,6 @@ import praw.models import pytest -import bdfr.site_downloaders.download_factory from bdfr.__main__ import setup_logging from bdfr.configuration import Configuration from bdfr.connector import 
RedditConnector From aa55a92791df149023af971670f2f4e9196cd13d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:58:56 +1000 Subject: [PATCH 34/38] Remove unused local variables --- bdfr/connector.py | 2 +- bdfr/site_downloaders/youtube.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index a4165fcb..5628e943 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -210,7 +210,7 @@ def create_file_logger(self): if log_path.exists(): try: file_handler.doRollover() - except PermissionError as e: + except PermissionError: logger.critical( 'Cannot rollover logfile, make sure this is the only ' 'BDFR process or specify alternate logfile location') diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index e12fdc16..8b93b230 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -43,7 +43,6 @@ def _download_video(self, ytdl_options: dict) -> Resource: except youtube_dl.DownloadError as e: raise SiteDownloaderError(f'Youtube download failed: {e}') - downloaded_file = None downloaded_files = list(download_path.iterdir()) if len(downloaded_files) > 0: downloaded_file = downloaded_files[0] From d5ef991b3abe093e8d2a8fcfd6e8251e0c5702f2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 15:11:09 +1000 Subject: [PATCH 35/38] Catch additional error in galleries --- bdfr/site_downloaders/gallery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index df161e52..62fec603 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -24,7 +24,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l except AttributeError: try: image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items']) - except (AttributeError, IndexError): + except (AttributeError, IndexError, 
TypeError): logger.error(f'Could not find gallery data in submission {self.post.id}') logger.exception('Gallery image find failure') raise SiteDownloaderError('No images found in Reddit gallery') From 7f1c929a080b2462897c5f998ba4fac6b9a60fa4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 3 Jul 2021 13:54:26 +1000 Subject: [PATCH 36/38] Add fallback scope --- bdfr/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 5628e943..1eb91c89 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -119,7 +119,7 @@ def create_reddit_instance(self): logger.debug('Using authenticated Reddit instance') if not self.cfg_parser.has_option('DEFAULT', 'user_token'): logger.log(9, 'Commencing OAuth2 authentication') - scopes = self.cfg_parser.get('DEFAULT', 'scopes') + scopes = self.cfg_parser.get('DEFAULT', 'scopes', fallback='identity, history, read, save') scopes = OAuth2Authenticator.split_scopes(scopes) oauth2_authenticator = OAuth2Authenticator( scopes, From d03a5e556e4699c84ed0f72994c78677a7020944 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 4 Jul 2021 10:59:35 +1000 Subject: [PATCH 37/38] Stop writing new value to config --- bdfr/connector.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 1eb91c89..d6d43dd7 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -90,10 +90,7 @@ def _setup_internal_objects(self): def read_config(self): """Read any cfg values that need to be processed""" if self.args.max_wait_time is None: - if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'): - self.cfg_parser.set('DEFAULT', 'max_wait_time', '120') - logger.log(9, 'Wrote default download wait time download to config file') - self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time') + self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time', fallback=120) logger.debug(f'Setting maximum download 
wait time to {self.args.max_wait_time} seconds') if self.args.time_format is None: option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO') From 2f8ca766c604ff69227bdc69d5c67fba38d01e3d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 4 Jul 2021 11:00:02 +1000 Subject: [PATCH 38/38] Update regex --- bdfr/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index d6d43dd7..0e78c8c0 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -94,7 +94,7 @@ def read_config(self): logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds') if self.args.time_format is None: option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO') - if re.match(r'^[ \'\"]*$', option): + if re.match(r'^[\s\'\"]*$', option): option = 'ISO' logger.debug(f'Setting datetime format string to {option}') self.args.time_format = option