From 7819b9e287ada10f062a413387288387312974bc Mon Sep 17 00:00:00 2001 From: sixinchfootlong <87159307+sixinchfootlong@users.noreply.github.com> Date: Sat, 16 Sep 2023 23:32:31 -0400 Subject: [PATCH 1/4] Fix Bunkr extractor Fixing the extractor since the recent site redesign. Download URLs can be obtained using additional HTTP requests (slow) or constructing them from other information on the page (fast). --- gallery_dl/extractor/bunkr.py | 91 ++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 12 deletions(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 1318d0e45dd..3c72a835734 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -8,9 +8,11 @@ """Extractors for https://bunkrr.su/""" +import re + from .lolisafe import LolisafeAlbumExtractor from .. import text -from urllib.parse import urlsplit, urlunsplit +from urllib.parse import urlparse, urlsplit, urlunsplit MEDIA_DOMAIN_OVERRIDES = { "cdn9.bunkr.ru" : "c9.bunkr.ru", @@ -29,7 +31,17 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): category = "bunkr" root = "https://bunkrr.su" pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)" - example = "https://bunkrr.su/a/ID" + example = "https://bunkrr.su/a/PoXwD1oA" + + def _get_download_url(self, media_url): + if media_url.startswith("/"): + media_url = self.root + media_url + # The download URL is in the first after the last

<h1>. + # Media preview pages only have one <h1>

but other pages have two. + html = self.request(media_url).text + header_pos = html.rindex(" containing the class "grid-images_box" + r"^ (\s*) ]+ \bgrid-images_box\b" + # The contents of the
<div>, ungreedy + r".*?" + # A closing </div> at the same indentation as the opening <div> + r"^ \1 </div>
", + page, + re.DOTALL | re.MULTILINE | re.VERBOSE + ) + for match in grid_tiles: + html = match.group() + url = None + + # The whole URL and path for viewing a single media. + media_url = text.extr(html, 'href="', '"') + + # Only get the CDN hostname once as it causes extra HTTP requests. + # We could get each media's download URL this way but doing so + # results in getting rate-limited, blocked, or a CAPTCHA page. + if cdn is None: + url = self._get_download_url(media_url) + cdn = text.root_from_url(url) + self.log.debug(f"Using CDN URL: {cdn}") + + # We can assemble the correct download URL for media files using: + # 1. The CDN hostname (e.g. "https://nugget.bunkr.ru") + # 2. The thumbnail file name (e.g. "File-1--rIrVIhmb.png") + # 3. The original file name (e.g. "File (1).mp4") + # The thumbnail file name has the sanitized file name and file ID + # but we need the file extension from the original file name. + thumbnail_url = text.extr(html, 'src="', '"') + if "no-image.svg" not in thumbnail_url: + thumbnail_url = None + + details = re.findall(r"]+> (.*?)

", html, re.VERBOSE) + try: + original_name = details[0].strip() + except IndexError: + original_name = None + + media_path = urlparse(media_url).path + if media_path.startswith("/d/"): + # Download pages already have the file name and ID in the URL. + # However, their album tiles usually have a placeholder + # thumbnail because the file has no preview. + # e.g. "/d/Resources-iwizfcKl.url" + download_path = text.filename_from_url(media_path) + url = f"{cdn}/{download_path}" + + elif media_path.startswith("/i/") or media_path.startswith("/v/"): + # For media preview pages, derive the download URL. + if thumbnail_url and original_name: + thumb_name = text.filename_from_url(thumbnail_url) + thumb_base, _, thumb_ext = thumb_name.rpartition(".") + orig_base, _, orig_ext = original_name.rpartition(".") + url = f"{cdn}/{thumb_base}.{orig_ext}" + + # If we still don't have a download URL, use the slow method. + # This is always required for MP3 files as they use a `/v/` media + # path like videos but don't have a preview thumbnail. + if not url: + url = self._get_download_url(media_url) url = text.unescape(url) if url.lower().endswith(CDN_HOSTED_EXTENSIONS): From a3767a0f256b6af555a9be4a29485f5fbefe5195 Mon Sep 17 00:00:00 2001 From: sixinchfootlong <87159307+sixinchfootlong@users.noreply.github.com> Date: Sat, 16 Sep 2023 23:37:59 -0400 Subject: [PATCH 2/4] Fix Python 3.5 compatibility --- gallery_dl/extractor/bunkr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 3c72a835734..571c9029ad0 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -106,7 +106,7 @@ def fetch_album(self, album_id): # thumbnail because the file has no preview. # e.g. 
"/d/Resources-iwizfcKl.url" download_path = text.filename_from_url(media_path) - url = f"{cdn}/{download_path}" + url = "{}/{}".format(cdn, download_path) elif media_path.startswith("/i/") or media_path.startswith("/v/"): # For media preview pages, derive the download URL. @@ -114,7 +114,7 @@ def fetch_album(self, album_id): thumb_name = text.filename_from_url(thumbnail_url) thumb_base, _, thumb_ext = thumb_name.rpartition(".") orig_base, _, orig_ext = original_name.rpartition(".") - url = f"{cdn}/{thumb_base}.{orig_ext}" + url = "{}/{}.{}".format(cdn, thumb_base, orig_ext) # If we still don't have a download URL, use the slow method. # This is always required for MP3 files as they use a `/v/` media From 9c042f66e72c54f4f3dcf6fbbe85c8f71790f025 Mon Sep 17 00:00:00 2001 From: sixinchfootlong <87159307+sixinchfootlong@users.noreply.github.com> Date: Sat, 16 Sep 2023 23:40:02 -0400 Subject: [PATCH 3/4] Fix Python 3.5 compatibility --- gallery_dl/extractor/bunkr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 571c9029ad0..ab704bf218d 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -81,7 +81,7 @@ def fetch_album(self, album_id): if cdn is None: url = self._get_download_url(media_url) cdn = text.root_from_url(url) - self.log.debug(f"Using CDN URL: {cdn}") + self.log.debug("Using CDN URL: {}".format(cdn)) # We can assemble the correct download URL for media files using: # 1. The CDN hostname (e.g. 
"https://nugget.bunkr.ru") From d43a8fffec901b7efa131cb9cba4e9d298a29bf1 Mon Sep 17 00:00:00 2001 From: sixinchfootlong <87159307+sixinchfootlong@users.noreply.github.com> Date: Sun, 17 Sep 2023 00:39:54 -0400 Subject: [PATCH 4/4] Fix missing thumbnail logic --- gallery_dl/extractor/bunkr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index ab704bf218d..5e05e3d3b08 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -90,7 +90,7 @@ def fetch_album(self, album_id): # The thumbnail file name has the sanitized file name and file ID # but we need the file extension from the original file name. thumbnail_url = text.extr(html, 'src="', '"') - if "no-image.svg" not in thumbnail_url: + if "no-image.svg" in thumbnail_url: thumbnail_url = None details = re.findall(r"]+> (.*?)

", html, re.VERBOSE)