From 5bc1097f9d26b51d545492396aa53fa16293d73b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Jun 2020 23:51:54 +0200 Subject: [PATCH] [twitter] metadata cleanup #2 - remove useless clutter by creating new tweet-data dicts instead of reusing the original Tweet objects - rename fields to how they were named before ('id_str' -> 'tweet_id', etc.) - only include 'author' if it would differ from 'user' - restore 'archive_fmt' --- gallery_dl/extractor/twitter.py | 131 ++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 33 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 067898fd10c..1ace136fc8f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -18,9 +18,9 @@ class TwitterExtractor(Extractor): """Base class for twitter extractors""" category = "twitter" - directory_fmt = ("{category}", "{user[screen_name]}") - filename_fmt = "{id_str}_{num}.{extension}" - archive_fmt = "{id_str}_{num}" + directory_fmt = ("{category}", "{user[name]}") + filename_fmt = "{tweet_id}_{num}.{extension}" + archive_fmt = "{tweet_id}_{retweet_id}_{num}" cookiedomain = ".twitter.com" root = "https://twitter.com" sizes = (":orig", ":large", ":medium", ":small") @@ -32,6 +32,7 @@ def __init__(self, match): self.replies = self.config("replies", True) self.twitpic = self.config("twitpic", False) self.videos = self.config("videos", True) + self._user_cache = {} def items(self): self.login() @@ -49,26 +50,23 @@ def items(self): if "extended_entities" not in tweet: continue - tweet.update(metadata) - tweet["date"] = text.parse_datetime( - tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") - entities = tweet["extended_entities"] - del tweet["extended_entities"] - del tweet["entities"] + tdata = self._transform_tweet(tweet) + tdata.update(metadata) - yield Message.Directory, tweet - for tweet["num"], media in enumerate(entities["media"], 1): + yield Message.Directory, tdata + for tdata["num"], media in enumerate( + tweet["extended_entities"]["media"], 1): - tweet["width"] = media["original_info"].get("width", 0) - tweet["height"] = media["original_info"].get("height", 0) + tdata["width"] = media["original_info"].get("width", 0) + tdata["height"] = media["original_info"].get("height", 0) if "video_info" in media and self.videos: if self.videos == "ytdl": url = "ytdl:{}/i/web/status/{}".format( self.root, tweet["id_str"]) - tweet["extension"] = None - yield Message.Url, url, tweet + tdata["extension"] = None + yield Message.Url, url, tdata else: video_info = media["video_info"] @@ -76,24 +74,24 @@ def items(self): video_info["variants"], key=lambda v: v.get("bitrate", 0), ) - tweet["duration"] = video_info.get( + tdata["duration"] = video_info.get( "duration_millis", 0) / 1000 - tweet["bitrate"] = variant.get("bitrate", 0) + tdata["bitrate"] = variant.get("bitrate", 0) url = variant["url"] - text.nameext_from_url(url, tweet) - yield Message.Url, url, tweet + text.nameext_from_url(url, tdata) + yield Message.Url, url, tdata elif "media_url_https" in media: url = media["media_url_https"] urls = [url + size for size in self.sizes] - text.nameext_from_url(url, tweet) - yield Message.Urllist, urls, tweet + text.nameext_from_url(url, tdata) + yield Message.Urllist, urls, tdata else: url = media["media_url"] - text.nameext_from_url(url, tweet) - yield Message.Url, url, tweet + text.nameext_from_url(url, tdata) + yield Message.Url, url, tdata def _extract_twitpic(self, tweet): twitpics = [] @@ -115,6 +113,73 @@ def _extract_twitpic(self, tweet): else: tweet["extended_entities"] = {"media": twitpics} + def _transform_tweet(self, tweet): + entities = tweet["entities"] + tdata = { + "tweet_id" : text.parse_int(tweet["id_str"]), + "retweet_id" : text.parse_int( + tweet.get("retweeted_status_id_str")), + "quote_id" : text.parse_int( + tweet.get("quoted_status_id_str")), + "reply_id" : text.parse_int( + tweet.get("in_reply_to_status_id_str")), + "date" : text.parse_datetime( + tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), + "user" : self._transform_user(tweet["user"]), + "lang" : tweet["lang"], + "content" : tweet["full_text"], + "favorite_count": tweet["favorite_count"], + "quote_count" : tweet["quote_count"], + "reply_count" : tweet["reply_count"], + "retweet_count" : tweet["retweet_count"], + } + + hashtags = entities.get("hashtags") + if hashtags: + tdata["hashtags"] = [t["text"] for t in hashtags] + + mentions = entities.get("user_mentions") + if mentions: + tdata["mentions"] = [{ + "id": text.parse_int(u["id_str"]), + "name": u["screen_name"], + "nick": u["name"], + } for u in mentions] + + if "full_text_quoted" in tweet: + tdata["content_quoted"] = tweet["full_text_quoted"] + + if "author" in tweet: + tdata["author"] = self._transform_user(tweet["author"]) + + return tdata + + def _transform_user(self, user): + uid = user["id_str"] + cache = self._user_cache + + if uid not in cache: + cache[uid] = { + "id" : text.parse_int(uid), + "name" : user["screen_name"], + "nick" : user["name"], + "description" : user["description"], + "location" : user["location"], + "date" : text.parse_datetime( + user["created_at"], "%a %b %d %H:%M:%S %z %Y"), + "verified" : user.get("verified", False), + "profile_banner" : user.get("profile_banner_url", ""), + "profile_image" : user.get( + "profile_image_url_https", "").replace("_normal.", "."), + "favourites_count": user["favourites_count"], + "followers_count" : user["followers_count"], + "friends_count" : user["friends_count"], + "listed_count" : user["listed_count"], + "media_count" : user["media_count"], + "statuses_count" : user["statuses_count"], + } + return cache[uid] + def metadata(self): """Return general metadata""" return {} @@ -235,7 +300,7 @@ class TwitterTweetExtractor(TwitterExtractor): }), # content with emoji, newlines, hashtags (#338) ("https://twitter.com/playpokemon/status/1263832915173048321", { - "keyword": {"full_text": ( + "keyword": {"content": ( r"re:Gear up for #PokemonSwordShieldEX with special Mystery " "Gifts! \n\nYou’ll be able to receive four Galarian form " "Pokémon with Hidden Abilities, plus some very useful items. " @@ -418,16 +483,16 @@ def _pagination(self, endpoint, params=None, tweet["user"] = users[tweet["user_id_str"]] if "quoted_status_id_str" in tweet: - quoted = tweets[tweet["quoted_status_id_str"]] - tweet["author"] = tweet["user"] - if "extended_entities" in quoted: - tweet["extended_entities"] = \ - quoted["extended_entities"] + quoted = tweets.get(tweet["quoted_status_id_str"]) + if quoted: + tweet["full_text_quoted"] = quoted["full_text"] + if "extended_entities" in quoted: + tweet["extended_entities"] = \ + quoted["extended_entities"] elif "retweeted_status_id_str" in tweet: - retweet = tweets[tweet["retweeted_status_id_str"]] - tweet["author"] = users[retweet["user_id_str"]] - else: - tweet["author"] = tweet["user"] + retweet = tweets.get(tweet["retweeted_status_id_str"]) + if retweet: + tweet["author"] = users[retweet["user_id_str"]] yield tweet