diff --git a/docs/configuration.rst b/docs/configuration.rst index 022e3c5268..432bc6e9fb 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2376,6 +2376,31 @@ Description for example ``tags_artist`` or ``tags_character``. +extractor.facebook.author-followups +----------------------------------- +Type + ``bool`` +Default + ``false`` +description + Extract comments that include photo attachments made by the author of the post. + + +extractor.facebook.videos +------------------------- +Type + * ``bool`` + * ``string`` +Default + ``true`` +Description + Control video download behavior. + + * ``true``: Extract and download video & audio separately. + * ``"ytdl"``: Let |ytdl| handle video extraction and download, and merge video & audio streams. + * ``false``: Ignore videos. + + extractor.fanbox.comments ------------------------- Type diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1c59a0ff0d..f779217e18 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -247,6 +247,12 @@ Consider all listed sites to potentially be NSFW. Favorites, Galleries, Search Results Supported + + Facebook + https://www.facebook.com/ + Photos, Profiles, Sets, Videos + Cookies + Fanleaks https://fanleaks.club/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8b0a158a4c..594ce41a96 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -50,6 +50,7 @@ "erome", "everia", "exhentai", + "facebook", "fanbox", "fanleaks", "fantia", diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py new file mode 100644 index 0000000000..a8b5690f24 --- /dev/null +++ b/gallery_dl/extractor/facebook.py @@ -0,0 +1,447 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.facebook.com/""" + +from .common import Extractor, Message +from .. import text, exception + +BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com" + + +class FacebookExtractor(Extractor): + """Base class for Facebook extractors""" + category = "facebook" + root = "https://www.facebook.com" + directory_fmt = ("{category}", "{username}", "{title} ({set_id})") + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}.{extension}" + + set_url_fmt = root + "/media/set/?set={set_id}" + photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}" + + def _init(self): + headers = self.session.headers + headers["Accept"] = ( + "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8" + ) + headers["Sec-Fetch-Dest"] = "empty" + headers["Sec-Fetch-Mode"] = "navigate" + headers["Sec-Fetch-Site"] = "same-origin" + + self.fallback_retries = self.config("fallback-retries", 2) + self.videos = self.config("videos", True) + self.author_followups = self.config("author-followups", False) + + @staticmethod + def decode_all(txt): + return text.unescape( + txt.encode("utf-8").decode("unicode_escape") + ).replace("\\/", "/") + + @staticmethod + def parse_set_page(set_page): + directory = { + "set_id": text.extr( + set_page, '"mediaSetToken":"', '"' + ) or text.extr( + set_page, '"mediasetToken":"', '"' + ), + "username": FacebookExtractor.decode_all( + text.extr( + set_page, '"user":{"__isProfile":"User","name":"', '","' + ) or text.extr( + set_page, '"actors":[{"__typename":"User","name":"', '","' + ) + ), + "user_id": text.extr( + set_page, '"owner":{"__typename":"User","id":"', '"' + ), + "title": FacebookExtractor.decode_all(text.extr( + set_page, '"title":{"text":"', '"' + )), + "first_photo_id": text.extr( + set_page, + '{"__typename":"Photo","__isMedia":"Photo","', + '","creation_story"' + ).rsplit('"id":"', 1)[-1] or + text.extr( + set_page, '{"__typename":"Photo","id":"', '"' + ) + } + + return directory + + @staticmethod + def parse_photo_page(photo_page): + photo = { + "id": text.extr( + photo_page, '"__isNode":"Photo","id":"', '"' + ), + "set_id": text.extr( + photo_page, + '"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=', + '"' + ).rsplit("&set=", 1)[-1], + "username": FacebookExtractor.decode_all(text.extr( + photo_page, '"owner":{"__typename":"User","name":"', '"' + )), + "user_id": text.extr( + photo_page, '"owner":{"__typename":"User","id":"', '"' + ), + "caption": FacebookExtractor.decode_all(text.extr( + photo_page, + '"message":{"delight_ranges"', + '"},"message_preferred_body"' + ).rsplit('],"text":"', 1)[-1]), + "date": text.parse_timestamp(text.extr( + photo_page, '\\"publish_time\\":', ',' + )), + "url": FacebookExtractor.decode_all(text.extr( + photo_page, ',"image":{"uri":"', '","' + )), + "next_photo_id": text.extr( + photo_page, + '"nextMediaAfterNodeId":{"__typename":"Photo","id":"', + '"' + ) or text.extr( + photo_page, + '"nextMedia":{"edges":[{"node":{"__typename":"Photo","id":"', + '"' + ) + } + + text.nameext_from_url(photo["url"], photo) + + photo["followups_ids"] = [] + for comment_raw in text.extract_iter( + photo_page, '{"node":{"id"', '"cursor":null}' + ): + if ('"is_author_original_poster":true' in comment_raw and + '{"__typename":"Photo","id":"' in comment_raw): + photo["followups_ids"].append(text.extr( + comment_raw, + '{"__typename":"Photo","id":"', + '"' + )) + + return photo + + @staticmethod + def parse_post_page(post_page): + first_photo_url = text.extr( + text.extr( + post_page, '"__isMedia":"Photo"', '"target_group"' + ), '"url":"', ',' + ) + + post = { + "set_id": text.extr(post_page, '{"mediaset_token":"', '"') or + text.extr(first_photo_url, 'set=', '"').rsplit("&", 1)[0] + } + + return post + + @staticmethod + def parse_video_page(video_page): + video = { + "id": text.extr( + video_page, '\\"video_id\\":\\"', '\\"' + ), + "username": FacebookExtractor.decode_all(text.extr( + video_page, '"actors":[{"__typename":"User","name":"', '","' + )), + "user_id": text.extr( + video_page, '"owner":{"__typename":"User","id":"', '"' + ), + "date": text.parse_timestamp(text.extr( + video_page, '\\"publish_time\\":', ',' + )), + "type": "video" + } + + if not video["username"]: + video["username"] = FacebookExtractor.decode_all(text.extr( + video_page, + '"__typename":"User","id":"' + video["user_id"] + '","name":"', + '","' + )) + + first_video_raw = text.extr( + video_page, '"permalink_url"', '\\/Period>\\u003C\\/MPD>' + ) + + audio = { + **video, + "url": FacebookExtractor.decode_all(text.extr( + text.extr( + first_video_raw, + "AudioChannelConfiguration", + "BaseURL>\\u003C" + ), + "BaseURL>", "\\u003C\\/" + )), + "type": "audio" + } + + video["urls"] = {} + + for raw_url in text.extract_iter( + first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>' + ): + resolution = raw_url.split('\\"', 1)[0] + video["urls"][resolution] = FacebookExtractor.decode_all( + raw_url.split('BaseURL>', 1)[1] + ) + + if not video["urls"]: + return video, audio + + video["url"] = max( + video["urls"].items(), + key=lambda x: text.parse_int(x[0][:-1]) + )[1] + + text.nameext_from_url(video["url"], video) + audio["filename"] = video["filename"] + audio["extension"] = "m4a" + + return video, audio + + def photo_page_request_wrapper(self, url, **kwargs): + LEFT_OFF_TXT = "" if url.endswith("&set=") else ( + "\nYou can use this URL to continue from " + "where you left off (added \"&setextract\"): " + "\n" + url + "&setextract" + ) + + res = self.request(url, **kwargs) + + if res.url.startswith(self.root + "/login"): + raise exception.AuthenticationError( + "You must be logged in to continue viewing images." + + LEFT_OFF_TXT + ) + + if b'{"__dr":"CometErrorRoot.react"}' in res.content: + raise exception.StopExtraction( + "You've been temporarily blocked from viewing images. " + "\nPlease try using a different account, " + "using a VPN or waiting before you retry." + + LEFT_OFF_TXT + ) + + return res + + def extract_set(self, first_photo_id, set_id): + all_photo_ids = [first_photo_id] + + retries = 0 + i = 0 + + while i < len(all_photo_ids): + photo_id = all_photo_ids[i] + photo_url = self.photo_url_fmt.format( + photo_id=photo_id, set_id=set_id + ) + photo_page = self.photo_page_request_wrapper(photo_url).text + + photo = self.parse_photo_page(photo_page) + photo["set_id"] = set_id + photo["num"] = i + 1 + + if self.author_followups: + for followup_id in photo["followups_ids"]: + if followup_id not in all_photo_ids: + self.log.debug( + "Found a followup in comments: %s", followup_id + ) + all_photo_ids.append(followup_id) + + if not photo["url"]: + if retries < self.fallback_retries and self._interval_429: + seconds = self._interval_429() + self.log.warning( + "Failed to find photo download URL for %s. " + "Retrying in %s seconds.", photo_url, seconds, + ) + self.wait(seconds=seconds, reason="429 Too Many Requests") + retries += 1 + continue + else: + self.log.error( + "Failed to find photo download URL for " + photo_url + + ". Skipping." + ) + retries = 0 + else: + retries = 0 + yield Message.Url, photo["url"], photo + + if photo["next_photo_id"] == "": + self.log.debug( + "Can't find next image in the set. " + "Extraction is over." + ) + elif photo["next_photo_id"] in all_photo_ids: + if photo["next_photo_id"] != photo["id"]: + self.log.debug( + "Detected a loop in the set, it's likely finished. " + "Extraction is over." + ) + else: + all_photo_ids.append(photo["next_photo_id"]) + + i += 1 + + +class FacebookSetExtractor(FacebookExtractor): + """Base class for Facebook Set extractors""" + subcategory = "set" + pattern = ( + BASE_PATTERN + + r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)" + r"[^/?#]*(?